/*
 * Falcon key pair generation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2017-2019  Falcon Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
 */

#include "inner.h"
#include "util.h"

#define MKN(logn)   ((size_t)1 << (logn))

/* ==================================================================== */
/*
 * Modular arithmetics.
 *
 * We implement a few functions for computing modulo a small integer p.
 *
 * All functions require that 2^30 < p < 2^31. Moreover, operands must
 * be in the 0..p-1 range.
 *
 * Modular addition and subtraction work for all such p.
 *
 * Montgomery multiplication requires that p is odd, and must be provided
 * with an additional value p0i = -1/p mod 2^31. See below for some basics
 * on Montgomery multiplication.
 *
 * Division computes an inverse modulo p by an exponentiation (with
 * exponent p-2): this works only if p is prime. Multiplication
 * requirements also apply, i.e. p must be odd and p0i must be provided.
 *
 * The NTT and inverse NTT need all of the above, and also that
 * p = 1 mod 2048.
 *
 * -----------------------------------------------------------------------
 *
 * We use Montgomery representation with 31-bit values:
 *
 *   Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p.
 *   Montgomery representation of an integer x modulo p is x*R mod p.
 *
 *   Montgomery multiplication computes (x*y)/R mod p for
 *   operands x and y. Therefore:
 *
 *    - if operands are x*R and y*R (Montgomery representations of x and
 *      y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R
 *      mod p, which is the Montgomery representation of the product x*y;
 *
 *    - if operands are x*R and y (or x and y*R), then Montgomery
 *      multiplication returns x*y mod p: mixed-representation
 *      multiplications yield results in normal representation.
 *
 * To convert to Montgomery representation, we multiply by R, which is done
 * by Montgomery-multiplying by R^2. Stand-alone conversion back from
 * Montgomery representation is Montgomery-multiplication by 1.
 */

/*
 * Precomputed small primes. Each element contains the following:
 *
 *  p   The prime itself.
 *
 *  g   A primitive root of phi = X^N+1 (in field Z_p).
 *
 *  s   The inverse of the product of all previous primes in the array,
 *      computed modulo p and in Montgomery representation.
 *
 * All primes are such that p = 1 mod 2048, and are lower than 2^31. They
 * are listed in decreasing order.
 */

typedef struct {
    uint32_t p;
    uint32_t g;
    uint32_t s;
} small_prime;

static const small_prime PRIMES[] = {
    { 2147473409,  383167813,      10239 },
    { 2147389441,  211808905,  471403745 },
    { 2147387393,   37672282, 1329335065 },
    { 2147377153, 1977035326,  968223422 },
    { 2147358721, 1067163706,  132460015 },
    { 2147352577, 1606082042,  598693809 },
    { 2147346433, 2033915641, 1056257184 },
    { 2147338241, 1653770625,  421286710 },
    { 2147309569,  631200819, 1111201074 },
    { 2147297281, 2038364663, 1042003613 },
    { 2147295233, 1962540515,   19440033 },
    { 2147239937, 2100082663,  353296760 },
    { 2147235841, 1991153006, 1703918027 },
    { 2147217409,  516405114, 1258919613 },
    { 2147205121,  409347988, 1089726929 },
    { 2147196929,  927788991, 1946238668 },
    { 2147178497, 1136922411, 1347028164 },
    { 2147100673,  868626236,  701164723 },
    { 2147082241, 1897279176,  617820870 },
    { 2147074049, 1888819123,  158382189 },
    { 2147051521,   25006327,  522758543 },
    { 2147043329,  327546255,   37227845 },
    { 2147039233,  766324424, 1133356428 },
    { 2146988033, 1862817362,   73861329 },
    { 2146963457,  404622040,  653019435 },
    { 2146959361, 1936581214,  995143093 },
    { 2146938881, 1559770096,  634921513 },
    { 2146908161,  422623708, 1985060172 },
    { 2146885633, 1751189170,  298238186 },
    { 2146871297,  578919515,  291810829 },
    { 2146846721, 1114060353,  915902322 },
    { 2146834433, 2069565474,   47859524 },
    { 2146818049, 1552824584,  646281055 },
    { 2146775041, 1906267847, 1597832891 },
    { 2146756609, 1847414714, 1228090888 },
    { 2146744321, 1818792070, 1176377637 },
    { 2146738177, 1118066398, 1054971214 },
    { 2146736129,   52057278,  933422153 },
    { 2146713601,  592259376, 1406621510 },
    { 2146695169,  263161877, 1514178701 },
    { 2146656257,  685363115,  384505091 },
    { 2146650113,  927727032,  537575289 },
    { 2146646017,   52575506, 1799464037 },
    { 2146643969, 1276803876, 1348954416 },
    { 2146603009,  814028633, 1521547704 },
    { 2146572289, 1846678872, 1310832121 },
    { 2146547713,  919368090, 1019041349 },
    { 2146508801,  671847612,   38582496 },
    { 2146492417,  283911680,  532424562 },
    { 2146490369, 1780044827,  896447978 },
    { 2146459649,  327980850, 1327906900 },
    { 2146447361, 1310561493,  958645253 },
    { 2146441217,  412148926,  287271128 },
    { 2146437121,  293186449, 2009822534 },
    { 2146430977,  179034356, 1359155584 },
    { 2146418689, 1517345488, 1790248672 },
    { 2146406401, 1615820390, 1584833571 },
    { 2146404353,  826651445,  607120498 },
    { 2146379777,    3816988, 1897049071 },
    { 2146363393, 1221409784, 1986921567 },
    { 2146355201, 1388081168,  849968120 },
    { 2146336769, 1803473237, 1655544036 },
    { 2146312193, 1023484977,  273671831 },
    { 2146293761, 1074591448,  467406983 },
    { 2146283521,  831604668, 1523950494 },
    { 2146203649,  712865423, 1170834574 },
    { 2146154497, 1764991362, 1064856763 },
    { 2146142209,  627386213, 1406840151 },
    { 2146127873, 1638674429, 2088393537 },
    { 2146099201, 1516001018,  690673370 },
    { 2146093057, 1294931393,  315136610 },
    { 2146091009, 1942399533,  973539425 },
    { 2146078721, 1843461814, 2132275436 },
    { 2146060289, 1098740778,  360423481 },
    { 2146048001, 1617213232, 1951981294 },
    { 2146041857, 1805783169, 2075683489 },
    { 2146019329,  272027909, 1753219918 },
    { 2145986561, 1206530344, 2034028118 },
    { 2145976321, 1243769360, 1173377644 },
    { 2145964033,  887200839, 1281344586 },
    { 2145906689, 1651026455,  906178216 },
    { 2145875969, 1673238256, 1043521212 },
    { 2145871873, 1226591210, 1399796492 },
    { 2145841153, 1465353397, 1324527802 },
    { 2145832961, 1150638905,  554084759 },
    { 2145816577,  221601706,  427340863 },
    { 2145785857,  608896761,  316590738 },
    { 2145755137, 1712054942, 1684294304 },
    { 2145742849, 1302302867,  724873116 },
    { 2145728513,  516717693,  431671476 },
    { 2145699841,  524575579, 1619722537 },
    { 2145691649, 1925625239,  982974435 },
    { 2145687553,  463795662, 1293154300 },
    { 2145673217,  771716636,  881778029 },
    { 2145630209, 1509556977,  837364988 },
    { 2145595393,  229091856,  851648427 },
    { 2145587201, 1796903241,  635342424 },
    { 2145525761,  715310882, 1677228081 },
    { 2145495041, 1040930522,  200685896 },
    { 2145466369,  949804237, 1809146322 },
    { 2145445889, 1673903706,   95316881 },
    { 2145390593,  806941852, 1428671135 },
    { 2145372161, 1402525292,  159350694 },
    { 2145361921, 2124760298, 1589134749 },
    { 2145359873, 1217503067, 1561543010 },
    { 2145355777,  338341402,   83865711 },
    { 2145343489, 1381532164,  641430002 },
    { 2145325057, 1883895478, 1528469895 },
    { 2145318913, 1335370424,   65809740 },
    { 2145312769, 2000008042, 1919775760 },
    { 2145300481,  961450962, 1229540578 },
    { 2145282049,  910466767, 1964062701 },
    { 2145232897,  816527501,  450152063 },
    { 2145218561, 1435128058, 1794509700 },
    { 2145187841,   33505311, 1272467582 },
    { 2145181697,  269767433, 1380363849 },
    { 2145175553,   56386299, 1316870546 },
    { 2145079297, 2106880293, 1391797340 },
    { 2145021953, 1347906152,  720510798 },
    { 2145015809,  206769262, 1651459955 },
    { 2145003521, 1885513236, 1393381284 },
    { 2144960513, 1810381315,   31937275 },
    { 2144944129, 1306487838, 2019419520 },
    { 2144935937,   37304730, 1841489054 },
    { 2144894977, 1601434616,  157985831 },
    { 2144888833,   98749330, 2128592228 },
    { 2144880641, 1772327002, 2076128344 },
    { 2144864257, 1404514762, 2029969964 },
    { 2144827393,  801236594,  406627220 },
    { 2144806913,  349217443, 1501080290 },
    { 2144796673, 1542656776, 2084736519 },
    { 2144778241, 1210734884, 1746416203 },
    { 2144759809, 1146598851,  716464489 },
    { 2144757761,  286328400, 1823728177 },
    { 2144729089, 1347555695, 1836644881 },
    { 2144727041, 1795703790,  520296412 },
    { 2144696321, 1302475157,  852964281 },
    { 2144667649, 1075877614,  504992927 },
    { 2144573441,  198765808, 1617144982 },
    { 2144555009,  321528767,  155821259 },
    { 2144550913,  814139516, 1819937644 },
    { 2144536577,  571143206,  962942255 },
    { 2144524289, 1746733766,    2471321 },
    { 2144512001, 1821415077,  124190939 },
    { 2144468993,  917871546, 1260072806 },
    { 2144458753,  378417981, 1569240563 },
    { 2144421889,  175229668, 1825620763 },
    { 2144409601, 1699216963,  351648117 },
    { 2144370689, 1071885991,  958186029 },
    { 2144348161, 1763151227,  540353574 },
    { 2144335873, 1060214804,  919598847 },
    { 2144329729,  663515846, 1448552668 },
    { 2144327681, 1057776305,  590222840 },
    { 2144309249, 1705149168, 1459294624 },
    { 2144296961,  325823721, 1649016934 },
    { 2144290817,  738775789,  447427206 },
    { 2144243713,  962347618,  893050215 },
    { 2144237569, 1655257077,  900860862 },
    { 2144161793,  242206694, 1567868672 },
    { 2144155649,  769415308, 1247993134 },
    { 2144137217,  320492023,  515841070 },
    { 2144120833, 1639388522,  770877302 },
    { 2144071681, 1761785233,  964296120 },
    { 2144065537,  419817825,  204564472 },
    { 2144028673,  666050597, 2091019760 },
    { 2144010241, 1413657615, 1518702610 },
    { 2143952897, 1238327946,  475672271 },
    { 2143940609,  307063413, 1176750846 },
    { 2143918081, 2062905559,  786785803 },
    { 2143899649, 1338112849, 1562292083 },
    { 2143891457,   68149545,   87166451 },
    { 2143885313,  921750778,  394460854 },
    { 2143854593,  719766593,  133877196 },
    { 2143836161, 1149399850, 1861591875 },
    { 2143762433, 1848739366, 1335934145 },
    { 2143756289, 1326674710,  102999236 },
    { 2143713281,  808061791, 1156900308 },
    { 2143690753,  388399459, 1926468019 },
    { 2143670273, 1427891374, 1756689401 },
    { 2143666177, 1912173949,  986629565 },
    { 2143645697, 2041160111,  371842865 },
    { 2143641601, 1279906897, 2023974350 },
    { 2143635457,  720473174, 1389027526 },
    { 2143621121, 1298309455, 1732632006 },
    { 2143598593, 1548762216, 1825417506 },
    { 2143567873,  620475784, 1073787233 },
    { 2143561729, 1932954575,  949167309 },
    { 2143553537,  354315656, 1652037534 },
    { 2143541249,  577424288, 1097027618 },
    { 2143531009,  357862822,  478640055 },
    { 2143522817, 2017706025, 1550531668 },
    { 2143506433, 2078127419, 1824320165 },
    { 2143488001,  613475285, 1604011510 },
    { 2143469569, 1466594987,  502095196 },
    { 2143426561, 1115430331, 1044637111 },
    { 2143383553,    9778045, 1902463734 },
    { 2143377409, 1557401276, 2056861771 },
    { 2143363073,  652036455, 1965915971 },
    { 2143260673, 1464581171, 1523257541 },
    { 2143246337, 1876119649,  764541916 },
    { 2143209473, 1614992673, 1920672844 },
    { 2143203329,  981052047, 2049774209 },
    { 2143160321, 1847355533,  728535665 },
    { 2143129601,  965558457,  603052992 },
    { 2143123457, 2140817191,    8348679 },
    { 2143100929, 1547263683,  694209023 },
    { 2143092737,  643459066, 1979934533 },
    { 2143082497,  188603778, 2026175670 },
    { 2143062017, 1657329695,  377451099 },
    { 2143051777,  114967950,  979255473 },
    { 2143025153, 1698431342, 1449196896 },
    { 2143006721, 1862741675, 1739650365 },
    { 2142996481,  756660457,  996160050 },
    { 2142976001,  927864010, 1166847574 },
    { 2142965761,  905070557,  661974566 },
    { 2142916609,   40932754, 1787161127 },
    { 2142892033, 1987985648,  675335382 },
    { 2142885889,  797497211, 1323096997 },
    { 2142871553, 2068025830, 1411877159 },
    { 2142861313, 1217177090, 1438410687 },
    { 2142830593,  409906375, 1767860634 },
    { 2142803969, 1197788993,  359782919 },
    { 2142785537,  643817365,  513932862 },
    { 2142779393, 1717046338,  218943121 },
    { 2142724097,   89336830,  416687049 },
    { 2142707713,    5944581, 1356813523 },
    { 2142658561,  887942135, 2074011722 },
    { 2142638081,  151851972, 1647339939 },
    { 2142564353, 1691505537, 1483107336 },
    { 2142533633, 1989920200, 1135938817 },
    { 2142529537,  959263126, 1531961857 },
    { 2142527489,  453251129, 1725566162 },
    { 2142502913, 1536028102,  182053257 },
    { 2142498817,  570138730,  701443447 },
    { 2142416897,  326965800,  411931819 },
    { 2142363649, 1675665410, 1517191733 },
    { 2142351361,  968529566, 1575712703 },
    { 2142330881, 1384953238, 1769087884 },
    { 2142314497, 1977173242, 1833745524 },
    { 2142289921,   95082313, 1714775493 },
    { 2142283777,  109377615, 1070584533 },
    { 2142277633,   16960510,  702157145 },
    { 2142263297,  553850819,  431364395 },
    { 2142208001,  241466367, 2053967982 },
    { 2142164993, 1795661326, 1031836848 },
    { 2142097409, 1212530046,  712772031 },
    { 2142087169, 1763869720,  822276067 },
    { 2142078977,  644065713, 1765268066 },
    { 2142074881,  112671944,  643204925 },
    { 2142044161, 1387785471, 1297890174 },
    { 2142025729,  783885537, 1000425730 },
    { 2142011393,  905662232, 1679401033 },
    { 2141974529,  799788433,  468119557 },
    { 2141943809, 1932544124,  449305555 },
    { 2141933569, 1527403256,  841867925 },
    { 2141931521, 1247076451,  743823916 },
    { 2141902849, 1199660531,  401687910 },
    { 2141890561,  150132350, 1720336972 },
    { 2141857793, 1287438162,  663880489 },
    { 2141833217,  618017731, 1819208266 },
    { 2141820929,  999578638, 1403090096 },
    { 2141786113,   81834325, 1523542501 },
    { 2141771777,  120001928,  463556492 },
    { 2141759489,  122455485, 2124928282 },
    { 2141749249,  141986041,  940339153 },
    { 2141685761,  889088734,  477141499 },
    { 2141673473,  324212681, 1122558298 },
    { 2141669377, 1175806187, 1373818177 },
    { 2141655041, 1113654822,  296887082 },
    { 2141587457,  991103258, 1585913875 },
    { 2141583361, 1401451409, 1802457360 },
    { 2141575169, 1571977166,  712760980 },
    { 2141546497, 1107849376, 1250270109 },
    { 2141515777,  196544219,  356001130 },
    { 2141495297, 1733571506, 1060744866 },
    { 2141483009,  321552363, 1168297026 },
    { 2141458433,  505818251,  733225819 },
    { 2141360129, 1026840098,  948342276 },
    { 2141325313,  945133744, 2129965998 },
    { 2141317121, 1871100260, 1843844634 },
    { 2141286401, 1790639498, 1750465696 },
    { 2141267969, 1376858592,  186160720 },
    { 2141255681, 2129698296, 1876677959 },
    { 2141243393, 2138900688, 1340009628 },
    { 2141214721, 1933049835, 1087819477 },
    { 2141212673, 1898664939, 1786328049 },
    { 2141202433,  990234828,  940682169 },
    { 2141175809, 1406392421,  993089586 },
    { 2141165569, 1263518371,  289019479 },
    { 2141073409, 1485624211,  507864514 },
    { 2141052929, 1885134788,  311252465 },
    { 2141040641, 1285021247,  280941862 },
    { 2141028353, 1527610374,  375035110 },
    { 2141011969, 1400626168,  164696620 },
    { 2140999681,  632959608,  966175067 },
    { 2140997633, 2045628978, 1290889438 },
    { 2140993537, 1412755491,  375366253 },
    { 2140942337,  719477232,  785367828 },
    { 2140925953,   45224252,  836552317 },
    { 2140917761, 1157376588, 1001839569 },
    { 2140887041,  278480752, 2098732796 },
    { 2140837889, 1663139953,  924094810 },
    { 2140788737,  802501511, 2045368990 },
    { 2140766209, 1820083885, 1800295504 },
    { 2140764161, 1169561905, 2106792035 },
    { 2140696577,  127781498, 1885987531 },
    { 2140684289,   16014477, 1098116827 },
    { 2140653569,  665960598, 1796728247 },
    { 2140594177, 1043085491,  377310938 },
    { 2140579841, 1732838211, 1504505945 },
    { 2140569601,  302071939,  358291016 },
    { 2140567553,  192393733, 1909137143 },
    { 2140557313,  406595731, 1175330270 },
    { 2140549121, 1748850918,  525007007 },
    { 2140477441,  499436566, 1031159814 },
    { 2140469249, 1886004401, 1029951320 },
    { 2140426241, 1483168100, 1676273461 },
    { 2140420097, 1779917297,  846024476 },
    { 2140413953,  522948893, 1816354149 },
    { 2140383233, 1931364473, 1296921241 },
    { 2140366849, 1917356555,  147196204 },
    { 2140354561,   16466177, 1349052107 },
    { 2140348417, 1875366972, 1860485634 },
    { 2140323841,  456498717, 1790256483 },
    { 2140321793, 1629493973,  150031888 },
    { 2140315649, 1904063898,  395510935 },
    { 2140280833, 1784104328,  831417909 },
    { 2140250113,  256087139,  697349101 },
    { 2140229633,  388553070,  243875754 },
    { 2140223489,  747459608, 1396270850 },
    { 2140200961,  507423743, 1895572209 },
    { 2140162049,  580106016, 2045297469 },
    { 2140149761,  712426444,  785217995 },
    { 2140137473, 1441607584,  536866543 },
    { 2140119041,  346538902, 1740434653 },
    { 2140090369,  282642885,   21051094 },
    { 2140076033, 1407456228,  319910029 },
    { 2140047361, 1619330500, 1488632070 },
    { 2140041217, 2089408064, 2012026134 },
    { 2140008449, 1705524800, 1613440760 },
    { 2139924481, 1846208233, 1280649481 },
    { 2139906049,  989438755, 1185646076 },
    { 2139867137, 1522314850,  372783595 },
    { 2139842561, 1681587377,  216848235 },
    { 2139826177, 2066284988, 1784999464 },
    { 2139824129,  480888214, 1513323027 },
    { 2139789313,  847937200,  858192859 },
    { 2139783169, 1642000434, 1583261448 },
    { 2139770881,  940699589,  179702100 },
    { 2139768833,  315623242,  964612676 },
    { 2139666433,  331649203,  764666914 },
    { 2139641857, 2118730799, 1313764644 },
    { 2139635713,  519149027,  519212449 },
    { 2139598849, 1526413634, 1769667104 },
    { 2139574273,  551148610,  820739925 },
    { 2139568129, 1386800242,  472447405 },
    { 2139549697,  813760130, 1412328531 },
    { 2139537409, 1615286260, 1609362979 },
    { 2139475969, 1352559299, 1696720421 },
    { 2139455489, 1048691649, 1584935400 },
    { 2139432961,  836025845,  950121150 },
    { 2139424769, 1558281165, 1635486858 },
    { 2139406337, 1728402143, 1674423301 },
    { 2139396097, 1727715782, 1483470544 },
    { 2139383809, 1092853491, 1741699084 },
    { 2139369473,  690776899, 1242798709 },
    { 2139351041, 1768782380, 2120712049 },
    { 2139334657, 1739968247, 1427249225 },
    { 2139332609, 1547189119,  623011170 },
    { 2139310081, 1346827917, 1605466350 },
    { 2139303937,  369317948,  828392831 },
    { 2139301889, 1560417239, 1788073219 },
    { 2139283457, 1303121623,  595079358 },
    { 2139248641, 1354555286,  573424177 },
    { 2139240449,   60974056,  885781403 },
    { 2139222017,  355573421, 1221054839 },
    { 2139215873,  566477826, 1724006500 },
    { 2139150337,  871437673, 1609133294 },
    { 2139144193, 1478130914, 1137491905 },
    { 2139117569, 1854880922,  964728507 },
    { 2139076609,  202405335,  756508944 },
    { 2139062273, 1399715741,  884826059 },
    { 2139045889, 1051045798, 1202295476 },
    { 2139033601, 1707715206,  632234634 },
    { 2139006977, 2035853139,  231626690 },
    { 2138951681,  183867876,  838350879 },
    { 2138945537, 1403254661,  404460202 },
    { 2138920961,  310865011, 1282911681 },
    { 2138910721, 1328496553,  103472415 },
    { 2138904577,   78831681,  993513549 },
    { 2138902529, 1319697451, 1055904361 },
    { 2138816513,  384338872, 1706202469 },
    { 2138810369, 1084868275,  405677177 },
    { 2138787841,  401181788, 1964773901 },
    { 2138775553, 1850532988, 1247087473 },
    { 2138767361,  874261901, 1576073565 },
    { 2138757121, 1187474742,  993541415 },
    { 2138748929, 1782458888, 1043206483 },
    { 2138744833, 1221500487,  800141243 },
    { 2138738689,  413465368, 1450660558 },
    { 2138695681,  739045140,  342611472 },
    { 2138658817, 1355845756,  672674190 },
    { 2138644481,  608379162, 1538874380 },
    { 2138632193, 1444914034,  686911254 },
    { 2138607617,  484707818, 1435142134 },
    { 2138591233,  539460669, 1290458549 },
    { 2138572801, 2093538990, 2011138646 },
    { 2138552321, 1149786988, 1076414907 },
    { 2138546177,  840688206, 2108985273 },
    { 2138533889,  209669619,  198172413 },
    { 2138523649, 1975879426, 1277003968 },
    { 2138490881, 1351891144, 1976858109 },
    { 2138460161, 1817321013, 1979278293 },
    { 2138429441, 1950077177,  203441928 },
    { 2138400769,  908970113,  628395069 },
    { 2138398721,  219890864,  758486760 },
    { 2138376193, 1306654379,  977554090 },
    { 2138351617,  298822498, 2004708503 },
    { 2138337281,  441457816, 1049002108 },
    { 2138320897, 1517731724, 1442269609 },
    { 2138290177, 1355911197, 1647139103 },
    { 2138234881,  531313247, 1746591962 },
    { 2138214401, 1899410930,  781416444 },
    { 2138202113, 1813477173, 1622508515 },
    { 2138191873, 1086458299, 1025408615 },
    { 2138183681, 1998800427,  827063290 },
    { 2138173441, 1921308898,  749670117 },
    { 2138103809, 1620902804, 2126787647 },
    { 2138099713,  828647069, 1892961817 },
    { 2138085377,  179405355, 1525506535 },
    { 2138060801,  615683235, 1259580138 },
    { 2138044417, 2030277840, 1731266562 },
    { 2138042369, 2087222316, 1627902259 },
    { 2138032129,  126388712, 1108640984 },
    { 2138011649,  715026550, 1017980050 },
    { 2137993217, 1693714349, 1351778704 },
    { 2137888769, 1289762259, 1053090405 },
    { 2137853953,  199991890, 1254192789 },
    { 2137833473,  941421685,  896995556 },
    { 2137817089,  750416446, 1251031181 },
    { 2137792513,  798075119,  368077456 },
    { 2137786369,  878543495, 1035375025 },
    { 2137767937,    9351178, 1156563902 },
    { 2137755649, 1382297614, 1686559583 },
    { 2137724929, 1345472850, 1681096331 },
    { 2137704449,  834666929,  630551727 },
    { 2137673729, 1646165729, 1892091571 },
    { 2137620481,  778943821,   48456461 },
    { 2137618433, 1730837875, 1713336725 },
    { 2137581569,  805610339, 1378891359 },
    { 2137538561,  204342388, 1950165220 },
    { 2137526273, 1947629754, 1500789441 },
    { 2137516033,  719902645, 1499525372 },
    { 2137491457,  230451261,  556382829 },
    { 2137440257,  979573541,  412760291 },
    { 2137374721,  927841248, 1954137185 },
    { 2137362433, 1243778559,  861024672 },
    { 2137313281, 1341338501,  980638386 },
    { 2137311233,  937415182, 1793212117 },
    { 2137255937,  795331324, 1410253405 },
    { 2137243649,  150756339, 1966999887 },
    { 2137182209,  163346914, 1939301431 },
    { 2137171969, 1952552395,  758913141 },
    { 2137159681,  570788721,  218668666 },
    { 2137147393, 1896656810, 2045670345 },
    { 2137141249,  358493842,  518199643 },
    { 2137139201, 1505023029,  674695848 },
    { 2137133057,   27911103,  830956306 },
    { 2137122817,  439771337, 1555268614 },
    { 2137116673,  790988579, 1871449599 },
    { 2137110529,  432109234,  811805080 },
    { 2137102337, 1357900653, 1184997641 },
    { 2137098241,  515119035, 1715693095 },
    { 2137090049,  408575203, 2085660657 },
    { 2137085953, 2097793407, 1349626963 },
    { 2137055233, 1556739954, 1449960883 },
    { 2137030657, 1545758650, 1369303716 },
    { 2136987649,  332602570,  103875114 },
    { 2136969217, 1499989506, 1662964115 },
    { 2136924161,  857040753,    4738842 },
    { 2136895489, 1948872712,  570436091 },
    { 2136893441,   58969960, 1568349634 },
    { 2136887297, 2127193379,  273612548 },
    { 2136850433,  111208983, 1181257116 },
    { 2136809473, 1627275942, 1680317971 },
    { 2136764417, 1574888217,   14011331 },
    { 2136741889,   14011055, 1129154251 },
    { 2136727553,   35862563, 1838555253 },
    { 2136721409,  310235666, 1363928244 },
    { 2136698881, 1612429202, 1560383828 },
    { 2136649729, 1138540131,  800014364 },
    { 2136606721,  602323503, 1433096652 },
    { 2136563713,  182209265, 1919611038 },
    { 2136555521,  324156477,  165591039 },
    { 2136549377,  195513113,  217165345 },
    { 2136526849, 1050768046,  939647887 },
    { 2136508417, 1886286237, 1619926572 },
    { 2136477697,  609647664,   35065157 },
    { 2136471553,  679352216, 1452259468 },
    { 2136457217,  128630031,  824816521 },
    { 2136422401,   19787464, 1526049830 },
    { 2136420353,  698316836, 1530623527 },
    { 2136371201, 1651862373, 1804812805 },
    { 2136334337,  326596005,  336977082 },
    { 2136322049,   63253370, 1904972151 },
    { 2136297473,  312176076,  172182411 },
    { 2136248321,  381261841,  369032670 },
    { 2136242177,  358688773, 1640007994 },
    { 2136229889,  512677188,   75585225 },
    { 2136219649, 2095003250, 1970086149 },
    { 2136207361, 1909650722,  537760675 },
    { 2136176641, 1334616195, 1533487619 },
    { 2136158209, 2096285632, 1793285210 },
    { 2136143873, 1897347517,  293843959 },
    { 2136133633,  923586222, 1022655978 },
    { 2136096769, 1464868191, 1515074410 },
    { 2136094721, 2020679520, 2061636104 },
    { 2136076289,  290798503, 1814726809 },
    { 2136041473,  156415894, 1250757633 },
    { 2135996417,  297459940, 1132158924 },
    { 2135955457,  538755304, 1688831340 },
    { 0, 0, 0 }
};

/*
 * Reduce a small signed integer modulo a small prime. The source
 * value x MUST be such that -p < x < p.
 */
static inline uint32_t
modp_set(int32_t x, uint32_t p) {
    uint32_t w;

    w = (uint32_t)x;
    w += p & -(w >> 31);
    return w;
}

/*
 * Normalize a modular integer around 0.
 */
static inline int32_t
modp_norm(uint32_t x, uint32_t p) {
    return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1)));
}

/*
 * Compute -1/p mod 2^31. This works for all odd integers p that fit
 * on 31 bits.
 */
static uint32_t
modp_ninv31(uint32_t p) {
    uint32_t y;

    y = 2 - p;
    y *= 2 - p * y;
    y *= 2 - p * y;
    y *= 2 - p * y;
    y *= 2 - p * y;
    return (uint32_t)0x7FFFFFFF & -y;
}

/*
 * Compute R = 2^31 mod p.
 */
static inline uint32_t
modp_R(uint32_t p) {
    /*
     * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply
     * 2^31 - p.
     */
    return ((uint32_t)1 << 31) - p;
}

/*
 * Addition modulo p.
 */
static inline uint32_t
modp_add(uint32_t a, uint32_t b, uint32_t p) {
    uint32_t d;

    d = a + b - p;
    d += p & -(d >> 31);
    return d;
}

/*
 * Subtraction modulo p.
 */
static inline uint32_t
modp_sub(uint32_t a, uint32_t b, uint32_t p) {
    uint32_t d;

    d = a - b;
    d += p & -(d >> 31);
    return d;
}

/*
 * Halving modulo p.
 */
/* unused
static inline uint32_t
modp_half(uint32_t a, uint32_t p)
{
    a += p & -(a & 1);
    return a >> 1;
}
*/

/*
 * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
 * It is required that p is an odd integer.
 */
static inline uint32_t
modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) {
    uint64_t z, w;
    uint32_t d;

    z = (uint64_t)a * (uint64_t)b;
    w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
    d = (uint32_t)((z + w) >> 31) - p;
    d += p & -(d >> 31);
    return d;
}

/*
 * Compute R2 = 2^62 mod p.
 */
static uint32_t
modp_R2(uint32_t p, uint32_t p0i) {
    uint32_t z;

    /*
     * Compute z = 2^31 mod p (this is the value 1 in Montgomery
     * representation), then double it with an addition.
     */
    z = modp_R(p);
    z = modp_add(z, z, p);

    /*
     * Square it five times to obtain 2^32 in Montgomery representation
     * (i.e. 2^63 mod p).
     */
    z = modp_montymul(z, z, p, p0i);
    z = modp_montymul(z, z, p, p0i);
    z = modp_montymul(z, z, p, p0i);
    z = modp_montymul(z, z, p, p0i);
    z = modp_montymul(z, z, p, p0i);

    /*
     * Halve the value mod p to get 2^62.
     */
    z = (z + (p & -(z & 1))) >> 1;
    return z;
}

/*
 * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
 * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
 * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
 */
static inline uint32_t
modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) {
    int i;
    uint32_t r, z;

    /*
     * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
     * representation of (2^31)^e mod p, where e = x-1.
     * R2 is 2^31 in Montgomery representation.
     */
    x --;
    r = R2;
    z = modp_R(p);
    for (i = 0; (1U << i) <= x; i ++) {
        if ((x & (1U << i)) != 0) {
            z = modp_montymul(z, r, p, p0i);
        }
        r = modp_montymul(r, r, p, p0i);
    }
    return z;
}

/*
 * Division modulo p. If the divisor (b) is 0, then 0 is returned.
 * This function computes proper results only when p is prime.
 * Parameters:
 *   a     dividend
 *   b     divisor
 *   p     odd prime modulus
 *   p0i   -1/p mod 2^31
 *   R     2^31 mod R
 */
static uint32_t
modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) {
    uint32_t z, e;
    int i;

    e = p - 2;
    z = R;
    for (i = 30; i >= 0; i --) {
        uint32_t z2;

        z = modp_montymul(z, z, p, p0i);
        z2 = modp_montymul(z, b, p, p0i);
        z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
    }

    /*
     * The loop above just assumed that b was in Montgomery
     * representation, i.e. really contained b*R; under that
     * assumption, it returns 1/b in Montgomery representation,
     * which is R/b. But we gave it b in normal representation,
     * so the loop really returned R/(b/R) = R^2/b.
     *
     * We want a/b, so we need one Montgomery multiplication with a,
     * which also remove one of the R factors, and another such
     * multiplication to remove the second R factor.
     */
    z = modp_montymul(z, 1, p, p0i);
    return modp_montymul(a, z, p, p0i);
}

/*
 * Bit-reversal index table.
 */
static const uint16_t REV10[] = {
    0,  512,  256,  768,  128,  640,  384,  896,   64,  576,  320,  832,
    192,  704,  448,  960,   32,  544,  288,  800,  160,  672,  416,  928,
    96,  608,  352,  864,  224,  736,  480,  992,   16,  528,  272,  784,
    144,  656,  400,  912,   80,  592,  336,  848,  208,  720,  464,  976,
    48,  560,  304,  816,  176,  688,  432,  944,  112,  624,  368,  880,
    240,  752,  496, 1008,    8,  520,  264,  776,  136,  648,  392,  904,
    72,  584,  328,  840,  200,  712,  456,  968,   40,  552,  296,  808,
    168,  680,  424,  936,  104,  616,  360,  872,  232,  744,  488, 1000,
    24,  536,  280,  792,  152,  664,  408,  920,   88,  600,  344,  856,
    216,  728,  472,  984,   56,  568,  312,  824,  184,  696,  440,  952,
    120,  632,  376,  888,  248,  760,  504, 1016,    4,  516,  260,  772,
    132,  644,  388,  900,   68,  580,  324,  836,  196,  708,  452,  964,
    36,  548,  292,  804,  164,  676,  420,  932,  100,  612,  356,  868,
    228,  740,  484,  996,   20,  532,  276,  788,  148,  660,  404,  916,
    84,  596,  340,  852,  212,  724,  468,  980,   52,  564,  308,  820,
    180,  692,  436,  948,  116,  628,  372,  884,  244,  756,  500, 1012,
    12,  524,  268,  780,  140,  652,  396,  908,   76,  588,  332,  844,
    204,  716,  460,  972,   44,  556,  300,  812,  172,  684,  428,  940,
    108,  620,  364,  876,  236,  748,  492, 1004,   28,  540,  284,  796,
    156,  668,  412,  924,   92,  604,  348,  860,  220,  732,  476,  988,
    60,  572,  316,  828,  188,  700,  444,  956,  124,  636,  380,  892,
    252,  764,  508, 1020,    2,  514,  258,  770,  130,  642,  386,  898,
    66,  578,  322,  834,  194,  706,  450,  962,   34,  546,  290,  802,
    162,  674,  418,  930,   98,  610,  354,  866,  226,  738,  482,  994,
    18,  530,  274,  786,  146,  658,  402,  914,   82,  594,  338,  850,
    210,  722,  466,  978,   50,  562,  306,  818,  178,  690,  434,  946,
    114,  626,  370,  882,  242,  754,  498, 1010,   10,  522,  266,  778,
    138,  650,  394,  906,   74,  586,  330,  842,  202,  714,  458,  970,
    42,  554,  298,  810,  170,  682,  426,  938,  106,  618,  362,  874,
    234,  746,  490, 1002,   26,  538,  282,  794,  154,  666,  410,  922,
    90,  602,  346,  858,  218,  730,  474,  986,   58,  570,  314,  826,
    186,  698,  442,  954,  122,  634,  378,  890,  250,  762,  506, 1018,
    6,  518,  262,  774,  134,  646,  390,  902,   70,  582,  326,  838,
    198,  710,  454,  966,   38,  550,  294,  806,  166,  678,  422,  934,
    102,  614,  358,  870,  230,  742,  486,  998,   22,  534,  278,  790,
    150,  662,  406,  918,   86,  598,  342,  854,  214,  726,  470,  982,
    54,  566,  310,  822,  182,  694,  438,  950,  118,  630,  374,  886,
    246,  758,  502, 1014,   14,  526,  270,  782,  142,  654,  398,  910,
    78,  590,  334,  846,  206,  718,  462,  974,   46,  558,  302,  814,
    174,  686,  430,  942,  110,  622,  366,  878,  238,  750,  494, 1006,
    30,  542,  286,  798,  158,  670,  414,  926,   94,  606,  350,  862,
    222,  734,  478,  990,   62,  574,  318,  830,  190,  702,  446,  958,
    126,  638,  382,  894,  254,  766,  510, 1022,    1,  513,  257,  769,
    129,  641,  385,  897,   65,  577,  321,  833,  193,  705,  449,  961,
    33,  545,  289,  801,  161,  673,  417,  929,   97,  609,  353,  865,
    225,  737,  481,  993,   17,  529,  273,  785,  145,  657,  401,  913,
    81,  593,  337,  849,  209,  721,  465,  977,   49,  561,  305,  817,
    177,  689,  433,  945,  113,  625,  369,  881,  241,  753,  497, 1009,
    9,  521,  265,  777,  137,  649,  393,  905,   73,  585,  329,  841,
    201,  713,  457,  969,   41,  553,  297,  809,  169,  681,  425,  937,
    105,  617,  361,  873,  233,  745,  489, 1001,   25,  537,  281,  793,
    153,  665,  409,  921,   89,  601,  345,  857,  217,  729,  473,  985,
    57,  569,  313,  825,  185,  697,  441,  953,  121,  633,  377,  889,
    249,  761,  505, 1017,    5,  517,  261,  773,  133,  645,  389,  901,
    69,  581,  325,  837,  197,  709,  453,  965,   37,  549,  293,  805,
    165,  677,  421,  933,  101,  613,  357,  869,  229,  741,  485,  997,
    21,  533,  277,  789,  149,  661,  405,  917,   85,  597,  341,  853,
    213,  725,  469,  981,   53,  565,  309,  821,  181,  693,  437,  949,
    117,  629,  373,  885,  245,  757,  501, 1013,   13,  525,  269,  781,
    141,  653,  397,  909,   77,  589,  333,  845,  205,  717,  461,  973,
    45,  557,  301,  813,  173,  685,  429,  941,  109,  621,  365,  877,
    237,  749,  493, 1005,   29,  541,  285,  797,  157,  669,  413,  925,
    93,  605,  349,  861,  221,  733,  477,  989,   61,  573,  317,  829,
    189,  701,  445,  957,  125,  637,  381,  893,  253,  765,  509, 1021,
    3,  515,  259,  771,  131,  643,  387,  899,   67,  579,  323,  835,
    195,  707,  451,  963,   35,  547,  291,  803,  163,  675,  419,  931,
    99,  611,  355,  867,  227,  739,  483,  995,   19,  531,  275,  787,
    147,  659,  403,  915,   83,  595,  339,  851,  211,  723,  467,  979,
    51,  563,  307,  819,  179,  691,  435,  947,  115,  627,  371,  883,
    243,  755,  499, 1011,   11,  523,  267,  779,  139,  651,  395,  907,
    75,  587,  331,  843,  203,  715,  459,  971,   43,  555,  299,  811,
    171,  683,  427,  939,  107,  619,  363,  875,  235,  747,  491, 1003,
    27,  539,  283,  795,  155,  667,  411,  923,   91,  603,  347,  859,
    219,  731,  475,  987,   59,  571,  315,  827,  187,  699,  443,  955,
    123,  635,  379,  891,  251,  763,  507, 1019,    7,  519,  263,  775,
    135,  647,  391,  903,   71,  583,  327,  839,  199,  711,  455,  967,
    39,  551,  295,  807,  167,  679,  423,  935,  103,  615,  359,  871,
    231,  743,  487,  999,   23,  535,  279,  791,  151,  663,  407,  919,
    87,  599,  343,  855,  215,  727,  471,  983,   55,  567,  311,  823,
    183,  695,  439,  951,  119,  631,  375,  887,  247,  759,  503, 1015,
    15,  527,  271,  783,  143,  655,  399,  911,   79,  591,  335,  847,
    207,  719,  463,  975,   47,  559,  303,  815,  175,  687,  431,  943,
    111,  623,  367,  879,  239,  751,  495, 1007,   31,  543,  287,  799,
    159,  671,  415,  927,   95,  607,  351,  863,  223,  735,  479,  991,
    63,  575,  319,  831,  191,  703,  447,  959,  127,  639,  383,  895,
    255,  767,  511, 1023
};

/*
 * Compute the roots for NTT and inverse NTT (binary case). Input
 * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 =
 * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g:
 *   gm[rev(i)] = g^i mod p
 *   igm[rev(i)] = (1/g)^i mod p
 * where rev() is the "bit reversal" function over 10 bits. It fills
 * the arrays only up to N = 2^logn values.
 *
 * The values stored in gm[] and igm[] are in Montgomery representation.
 *
 * p must be a prime such that p = 1 mod 2048.
 */
static void
modp_mkgm2(uint32_t *restrict gm, uint32_t *restrict igm, unsigned logn,
           uint32_t g, uint32_t p, uint32_t p0i) {
    size_t u, n;
    unsigned k;
    uint32_t ig, x1, x2, R2;

    n = (size_t)1 << logn;

    /*
     * We want g such that g^(2N) = 1 mod p, but the provided
     * generator has order 2048. We must square it a few times.
     */
    R2 = modp_R2(p, p0i);
    g = modp_montymul(g, R2, p, p0i);
    for (k = logn; k < 10; k ++) {
        g = modp_montymul(g, g, p, p0i);
    }

    ig = modp_div(R2, g, p, p0i, modp_R(p));
    k = 10 - logn;
    x1 = x2 = modp_R(p);
    for (u = 0; u < n; u ++) {
        size_t v;

        v = REV10[u << k];
        gm[v] = x1;
        igm[v] = x2;
        x1 = modp_montymul(x1, g, p, p0i);
        x2 = modp_montymul(x2, ig, p, p0i);
    }
}

/*
 * Compute the NTT over a polynomial (binary case). Polynomial elements
 * are a[0], a[stride], a[2 * stride]...
 */
static void
modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn,
              uint32_t p, uint32_t p0i) {
    size_t t, m, n;

    if (logn == 0) {
        return;
    }
    n = (size_t)1 << logn;
    t = n;
    for (m = 1; m < n; m <<= 1) {
        size_t ht, u, v1;

        ht = t >> 1;
        for (u = 0, v1 = 0; u < m; u ++, v1 += t) {
            uint32_t s;
            size_t v;
            uint32_t *r1, *r2;

            s = gm[m + u];
            r1 = a + v1 * stride;
            r2 = r1 + ht * stride;
            for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
                uint32_t x, y;

                x = *r1;
                y = modp_montymul(*r2, s, p, p0i);
                *r1 = modp_add(x, y, p);
                *r2 = modp_sub(x, y, p);
            }
        }
        t = ht;
    }
}

/*
 * Compute the inverse NTT over a polynomial (binary case).
 */
static void
modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
               uint32_t p, uint32_t p0i) {
    size_t t, m, n, k;
    uint32_t ni;
    uint32_t *r;

    if (logn == 0) {
        return;
    }
    n = (size_t)1 << logn;
    t = 1;
    for (m = n; m > 1; m >>= 1) {
        size_t hm, dt, u, v1;

        hm = m >> 1;
        dt = t << 1;
        for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) {
            uint32_t s;
            size_t v;
            uint32_t *r1, *r2;

            s = igm[hm + u];
            r1 = a + v1 * stride;
            r2 = r1 + t * stride;
            for (v = 0; v < t; v ++, r1 += stride, r2 += stride) {
                uint32_t x, y;

                x = *r1;
                y = *r2;
                *r1 = modp_add(x, y, p);
                *r2 = modp_montymul(
                          modp_sub(x, y, p), s, p, p0i);;
            }
        }
        t = dt;
    }

    /*
     * We need 1/n in Montgomery representation, i.e. R/n. Since
     * 1 <= logn <= 10, R/n is an integer; morever, R/n <= 2^30 < p,
     * thus a simple shift will do.
     */
    ni = (uint32_t)1 << (31 - logn);
    for (k = 0, r = a; k < n; k ++, r += stride) {
        *r = modp_montymul(*r, ni, p, p0i);
    }
}

/*
 * Simplified macros for NTT and iNTT (binary case) when the elements
 * are consecutive in RAM.
 */
#define modp_NTT2(a, gm, logn, p, p0i)   modp_NTT2_ext(a, 1, gm, logn, p, p0i)
#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)

/*
 * Given polynomial f in NTT representation modulo p, compute f' of degree
 * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
 * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
 *
 * The new polynomial is written "in place" over the first N/2 elements
 * of f.
 *
 * If applied logn times successively on a given polynomial, the resulting
 * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
 *
 * This function applies only to the binary case; it is invoked from
 * solve_NTRU_binary_depth1().
 */
static void
modp_poly_rec_res(uint32_t *f, unsigned logn,
                  uint32_t p, uint32_t p0i, uint32_t R2) {
    size_t hn, u;

    hn = (size_t)1 << (logn - 1);
    for (u = 0; u < hn; u ++) {
        uint32_t w0, w1;

        w0 = f[(u << 1) + 0];
        w1 = f[(u << 1) + 1];
        f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i);
    }
}

/* ==================================================================== */
/*
 * Custom bignum implementation.
 *
 * This is a very reduced set of functionalities. We need to do the
 * following operations:
 *
 *  - Rebuild the resultant and the polynomial coefficients from their
 *    values modulo small primes (of length 31 bits each).
 *
 *  - Compute an extended GCD between the two computed resultants.
 *
 *  - Extract top bits and add scaled values during the successive steps
 *    of Babai rounding.
 *
 * When rebuilding values using CRT, we must also recompute the product
 * of the small prime factors. We always do it one small factor at a
 * time, so the "complicated" operations can be done modulo the small
 * prime with the modp_* functions. CRT coefficients (inverses) are
 * precomputed.
 *
 * All values are positive until the last step: when the polynomial
 * coefficients have been rebuilt, we normalize them around 0. But then,
 * only additions and subtractions on the upper few bits are needed
 * afterwards.
 *
 * We keep big integers as arrays of 31-bit words (in uint32_t values);
 * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
 * makes it easier to keep track of carries. When negative values are
 * used, two's complement is used.
 */

/*
 * Subtract integer b from integer a. Both integers are supposed to have
 * the same size. The carry (0 or 1) is returned. Source arrays a and b
 * MUST be distinct.
 *
 * The operation is performed as described above if ctr = 1. If
 * ctl = 0, the value a[] is unmodified, but all memory accesses are
 * still performed, and the carry is computed and returned.
 */
static uint32_t
zint_sub(uint32_t *restrict a, const uint32_t *restrict b, size_t len,
         uint32_t ctl) {
    size_t u;
    uint32_t cc, m;

    cc = 0;
    m = -ctl;
    for (u = 0; u < len; u ++) {
        uint32_t aw, w;

        aw = a[u];
        w = aw - b[u] - cc;
        cc = w >> 31;
        aw ^= ((w & 0x7FFFFFFF) ^ aw) & m;
        a[u] = aw;
    }
    return cc;
}

/*
 * Mutiply the provided big integer m with a small value x.
 * This function assumes that x < 2^31. The carry word is returned.
 */
static uint32_t
zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) {
    size_t u;
    uint32_t cc;

    cc = 0;
    for (u = 0; u < mlen; u ++) {
        uint64_t z;

        z = (uint64_t)m[u] * (uint64_t)x + cc;
        m[u] = (uint32_t)z & 0x7FFFFFFF;
        cc = (uint32_t)(z >> 31);
    }
    return cc;
}

/*
 * Reduce a big integer d modulo a small integer p.
 * Rules:
 *  d is unsigned
 *  p is prime
 *  2^30 < p < 2^31
 *  p0i = -(1/p) mod 2^31
 *  R2 = 2^62 mod p
 */
static uint32_t
zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
                        uint32_t p, uint32_t p0i, uint32_t R2) {
    uint32_t x;
    size_t u;

    /*
     * Algorithm: we inject words one by one, starting with the high
     * word. Each step is:
     *  - multiply x by 2^31
     *  - add new word
     */
    x = 0;
    u = dlen;
    while (u -- > 0) {
        uint32_t w;

        x = modp_montymul(x, R2, p, p0i);
        w = d[u] - p;
        w += p & -(w >> 31);
        x = modp_add(x, w, p);
    }
    return x;
}

/*
 * Similar to zint_mod_small_unsigned(), except that d may be signed.
 * Extra parameter is Rx = 2^(31*dlen) mod p.
 */
static uint32_t
zint_mod_small_signed(const uint32_t *d, size_t dlen,
                      uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) {
    uint32_t z;

    if (dlen == 0) {
        return 0;
    }
    z = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
    z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p);
    return z;
}

/*
 * Add y*s to x. x and y initially have length 'len' words; the new x
 * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
 * not overlap.
 */
static void
zint_add_mul_small(uint32_t *restrict x,
                   const uint32_t *restrict y, size_t len, uint32_t s) {
    size_t u;
    uint32_t cc;

    cc = 0;
    for (u = 0; u < len; u ++) {
        uint32_t xw, yw;
        uint64_t z;

        xw = x[u];
        yw = y[u];
        z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc;
        x[u] = (uint32_t)z & 0x7FFFFFFF;
        cc = (uint32_t)(z >> 31);
    }
    x[len] = cc;
}

/*
 * Normalize a modular integer around 0: if x > p/2, then x is replaced
 * with x - p (signed encoding with two's complement); otherwise, x is
 * untouched. The two integers x and p are encoded over the same length.
 */
static void
zint_norm_zero(uint32_t *restrict x, const uint32_t *restrict p, size_t len) {
    size_t u;
    uint32_t r, bb;

    /*
     * Compare x with p/2. We use the shifted version of p, and p
     * is odd, so we really compare with (p-1)/2; we want to perform
     * the subtraction if and only if x > (p-1)/2.
     */
    r = 0;
    bb = 0;
    u = len;
    while (u -- > 0) {
        uint32_t wx, wp, cc;

        /*
         * Get the two words to compare in wx and wp (both over
         * 31 bits exactly).
         */
        wx = x[u];
        wp = (p[u] >> 1) | (bb << 30);
        bb = p[u] & 1;

        /*
         * We set cc to -1, 0 or 1, depending on whether wp is
         * lower than, equal to, or greater than wx.
         */
        cc = wp - wx;
        cc = ((-cc) >> 31) | -(cc >> 31);

        /*
         * If r != 0 then it is either 1 or -1, and we keep its
         * value. Otherwise, if r = 0, then we replace it with cc.
         */
        r |= cc & ((r & 1) - 1);
    }

    /*
     * At this point, r = -1, 0 or 1, depending on whether (p-1)/2
     * is lower than, equal to, or greater than x. We thus want to
     * do the subtraction only if r = -1.
     */
    zint_sub(x, p, len, r >> 31);
}

/*
 * Rebuild integers from their RNS representation. There are 'num'
 * integers, and each consists in 'xlen' words. 'xx' points at that
 * first word of the first integer; subsequent integers are accessed
 * by adding 'xstride' repeatedly.
 *
 * The words of an integer are the RNS representation of that integer,
 * using the provided 'primes' are moduli. This function replaces
 * each integer with its multi-word value (little-endian order).
 *
 * If "normalize_signed" is non-zero, then the returned value is
 * normalized to the -m/2..m/2 interval (where m is the product of all
 * small prime moduli); two's complement is used for negative values.
 */
static void
zint_rebuild_CRT(uint32_t *restrict xx, size_t xlen, size_t xstride,
                 size_t num, const small_prime *primes, int normalize_signed,
                 uint32_t *restrict tmp) {
    size_t u;
    uint32_t *x;

    tmp[0] = primes[0].p;
    for (u = 1; u < xlen; u ++) {
        /*
         * At the entry of each loop iteration:
         *  - the first u words of each array have been
         *    reassembled;
         *  - the first u words of tmp[] contains the
         * product of the prime moduli processed so far.
         *
         * We call 'q' the product of all previous primes.
         */
        uint32_t p, p0i, s, R2;
        size_t v;

        p = primes[u].p;
        s = primes[u].s;
        p0i = modp_ninv31(p);
        R2 = modp_R2(p, p0i);

        for (v = 0, x = xx; v < num; v ++, x += xstride) {
            uint32_t xp, xq, xr;
            /*
             * xp = the integer x modulo the prime p for this
             *      iteration
             * xq = (x mod q) mod p
             */
            xp = x[u];
            xq = zint_mod_small_unsigned(x, u, p, p0i, R2);

            /*
             * New value is (x mod q) + q * (s * (xp - xq) mod p)
             */
            xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i);
            zint_add_mul_small(x, tmp, u, xr);
        }

        /*
         * Update product of primes in tmp[].
         */
        tmp[u] = zint_mul_small(tmp, u, p);
    }

    /*
     * Normalize the reconstructed values around 0.
     */
    if (normalize_signed) {
        for (u = 0, x = xx; u < num; u ++, x += xstride) {
            zint_norm_zero(x, tmp, xlen);
        }
    }
}

/*
 * Negate a big integer conditionally: value a is replaced with -a if
 * and only if ctl = 1. Control value ctl must be 0 or 1.
 */
static void
zint_negate(uint32_t *a, size_t len, uint32_t ctl) {
    size_t u;
    uint32_t cc, m;

    /*
     * If ctl = 1 then we flip the bits of a by XORing with
     * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR
     * with 0 and add 0, which leaves the value unchanged.
     */
    cc = ctl;
    m = -ctl >> 1;
    for (u = 0; u < len; u ++) {
        uint32_t aw;

        aw = a[u];
        aw = (aw ^ m) + cc;
        a[u] = aw & 0x7FFFFFFF;
        cc = aw >> 31;
    }
}

/*
 * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
 * The low bits are dropped (the caller should compute the coefficients
 * such that these dropped bits are all zeros). If either or both
 * yields a negative value, then the value is negated.
 *
 * Returned value is:
 *  0  both values were positive
 *  1  new a had to be negated
 *  2  new b had to be negated
 *  3  both new a and new b had to be negated
 *
 * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
 */
static uint32_t
zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
               int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
    size_t u;
    int64_t cca, ccb;
    uint32_t nega, negb;

    cca = 0;
    ccb = 0;
    for (u = 0; u < len; u ++) {
        uint32_t wa, wb;
        uint64_t za, zb;

        wa = a[u];
        wb = b[u];
        za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
        zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
        if (u > 0) {
            a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
            b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
        }
        cca = *(int64_t *)&za >> 31;
        ccb = *(int64_t *)&zb >> 31;
    }
    a[len - 1] = (uint32_t)cca;
    b[len - 1] = (uint32_t)ccb;

    nega = (uint32_t)((uint64_t)cca >> 63);
    negb = (uint32_t)((uint64_t)ccb >> 63);
    zint_negate(a, len, nega);
    zint_negate(b, len, negb);
    return nega | (negb << 1);
}

/*
 * Finish modular reduction. Rules on input parameters:
 *
 *   if neg = 1, then -m <= a < 0
 *   if neg = 0, then 0 <= a < 2*m
 *
 * If neg = 0, then the top word of a[] is allowed to use 32 bits.
 *
 * Modulus m must be odd.
 */
static void
zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) {
    size_t u;
    uint32_t cc, xm, ym;

    /*
     * First pass: compare a (assumed nonnegative) with m. Note that
     * if the top word uses 32 bits, subtracting m must yield a
     * value less than 2^31 since a < 2*m.
     */
    cc = 0;
    for (u = 0; u < len; u ++) {
        cc = (a[u] - m[u] - cc) >> 31;
    }

    /*
     * If neg = 1 then we must add m (regardless of cc)
     * If neg = 0 and cc = 0 then we must subtract m
     * If neg = 0 and cc = 1 then we must do nothing
     *
     * In the loop below, we conditionally subtract either m or -m
     * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1);
     * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0.
     */
    xm = -neg >> 1;
    ym = -(neg | (1 - cc));
    cc = neg;
    for (u = 0; u < len; u ++) {
        uint32_t aw, mw;

        aw = a[u];
        mw = (m[u] ^ xm) & ym;
        aw = aw - mw - cc;
        a[u] = aw & 0x7FFFFFFF;
        cc = aw >> 31;
    }
}

/*
 * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with
 * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31.
 */
static void
zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len,
                   uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
    size_t u;
    int64_t cca, ccb;
    uint32_t fa, fb;

    /*
     * These are actually four combined Montgomery multiplications.
     */
    cca = 0;
    ccb = 0;
    fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF;
    fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF;
    for (u = 0; u < len; u ++) {
        uint32_t wa, wb;
        uint64_t za, zb;

        wa = a[u];
        wb = b[u];
        za = wa * (uint64_t)xa + wb * (uint64_t)xb
             + m[u] * (uint64_t)fa + (uint64_t)cca;
        zb = wa * (uint64_t)ya + wb * (uint64_t)yb
             + m[u] * (uint64_t)fb + (uint64_t)ccb;
        if (u > 0) {
            a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
            b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
        }
        cca = *(int64_t *)&za >> 31;
        ccb = *(int64_t *)&zb >> 31;
    }
    a[len - 1] = (uint32_t)cca;
    b[len - 1] = (uint32_t)ccb;

    /*
     * At this point:
     *   -m <= a < 2*m
     *   -m <= b < 2*m
     * (this is a case of Montgomery reduction)
     * The top words of 'a' and 'b' may have a 32-th bit set.
     * We want to add or subtract the modulus, as required.
     */
    zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
    zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
}

/*
 * Compute a GCD between two positive big integers x and y. The two
 * integers must be odd. Returned value is 1 if the GCD is 1, 0
 * otherwise. When 1 is returned, arrays u and v are filled with values
 * such that:
 *   0 <= u <= y
 *   0 <= v <= x
 *   x*u - y*v = 1
 * x[] and y[] are unmodified. Both input values must have the same
 * encoded length. Temporary array must be large enough to accommodate 4
 * extra values of that length. Arrays u, v and tmp may not overlap with
 * each other, or with either x or y.
 */
static int
zint_bezout(uint32_t *restrict u, uint32_t *restrict v,
            const uint32_t *restrict x, const uint32_t *restrict y,
            size_t len, uint32_t *restrict tmp) {
    /*
     * Algorithm is an extended binary GCD. We maintain 6 values
     * a, b, u0, u1, v0 and v1 with the following invariants:
     *
     *  a = x*u0 - y*v0
     *  b = x*u1 - y*v1
     *  0 <= a <= x
     *  0 <= b <= y
     *  0 <= u0 < y
     *  0 <= v0 < x
     *  0 <= u1 <= y
     *  0 <= v1 < x
     *
     * Initial values are:
     *
     *  a = x   u0 = 1   v0 = 0
     *  b = y   u1 = y   v1 = x-1
     *
     * Each iteration reduces either a or b, and maintains the
     * invariants. Algorithm stops when a = b, at which point their
     * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains
     * the values (u,v) we want to return.
     *
     * The formal definition of the algorithm is a sequence of steps:
     *
     *  - If a is even, then:
     *        a <- a/2
     *        u0 <- u0/2 mod y
     *        v0 <- v0/2 mod x
     *
     *  - Otherwise, if b is even, then:
     *        b <- b/2
     *        u1 <- u1/2 mod y
     *        v1 <- v1/2 mod x
     *
     *  - Otherwise, if a > b, then:
     *        a <- (a-b)/2
     *        u0 <- (u0-u1)/2 mod y
     *        v0 <- (v0-v1)/2 mod x
     *
     *  - Otherwise:
     *        b <- (b-a)/2
     *        u1 <- (u1-u0)/2 mod y
     *        v1 <- (v1-v0)/2 mod y
     *
     * We can show that the operations above preserve the invariants:
     *
     *  - If a is even, then u0 and v0 are either both even or both
     *    odd (since a = x*u0 - y*v0, and x and y are both odd).
     *    If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2).
     *    Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way,
     *    the a = x*u0 - y*v0 invariant is preserved.
     *
     *  - The same holds for the case where b is even.
     *
     *  - If a and b are odd, and a > b, then:
     *
     *      a-b = x*(u0-u1) - y*(v0-v1)
     *
     *    In that situation, if u0 < u1, then x*(u0-u1) < 0, but
     *    a-b > 0; therefore, it must be that v0 < v1, and the
     *    first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x),
     *    which preserves the invariants. Otherwise, if u0 > u1,
     *    then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and
     *    b >= 0, hence a-b <= x. It follows that, in that case,
     *    v0-v1 >= 0. The first part of the update is then:
     *    (u0,v0) <- (u0-u1,v0-v1), which again preserves the
     *    invariants.
     *
     *    Either way, once the subtraction is done, the new value of
     *    a, which is the difference of two odd values, is even,
     *    and the remaining of this step is a subcase of the
     *    first algorithm case (i.e. when a is even).
     *
     *  - If a and b are odd, and b > a, then the a similar
     *    argument holds.
     *
     * The values a and b start at x and y, respectively. Since x
     * and y are odd, their GCD is odd, and it is easily seen that
     * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b);
     * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a
     * or b is reduced by at least one bit at each iteration, so
     * the algorithm necessarily converges on the case a = b, at
     * which point the common value is the GCD.
     *
     * In the algorithm expressed above, when a = b, the fourth case
     * applies, and sets b = 0. Since a contains the GCD of x and y,
     * which are both odd, a must be odd, and subsequent iterations
     * (if any) will simply divide b by 2 repeatedly, which has no
     * consequence. Thus, the algorithm can run for more iterations
     * than necessary; the final GCD will be in a, and the (u,v)
     * coefficients will be (u0,v0).
     *
     *
     * The presentation above is bit-by-bit. It can be sped up by
     * noticing that all decisions are taken based on the low bits
     * and high bits of a and b. We can extract the two top words
     * and low word of each of a and b, and compute reduction
     * parameters pa, pb, qa and qb such that the new values for
     * a and b are:
     *    a' = (a*pa + b*pb) / (2^31)
     *    b' = (a*qa + b*qb) / (2^31)
     * the two divisions being exact. The coefficients are obtained
     * just from the extracted words, and may be slightly off, requiring
     * an optional correction: if a' < 0, then we replace pa with -pa
     * and pb with -pb. Each such step will reduce the total length
     * (sum of lengths of a and b) by at least 30 bits at each
     * iteration.
     */
    uint32_t *u0, *u1, *v0, *v1, *a, *b;
    uint32_t x0i, y0i;
    uint32_t num, rc;
    size_t j;

    if (len == 0) {
        return 0;
    }

    /*
     * u0 and v0 are the u and v result buffers; the four other
     * values (u1, v1, a and b) are taken from tmp[].
     */
    u0 = u;
    v0 = v;
    u1 = tmp;
    v1 = u1 + len;
    a = v1 + len;
    b = a + len;

    /*
     * We'll need the Montgomery reduction coefficients.
     */
    x0i = modp_ninv31(x[0]);
    y0i = modp_ninv31(y[0]);

    /*
     * Initialize a, b, u0, u1, v0 and v1.
     *  a = x   u0 = 1   v0 = 0
     *  b = y   u1 = y   v1 = x-1
     * Note that x is odd, so computing x-1 is easy.
     */
    memcpy(a, x, len * sizeof * x);
    memcpy(b, y, len * sizeof * y);
    u0[0] = 1;
    memset(u0 + 1, 0, (len - 1) * sizeof * u0);
    memset(v0, 0, len * sizeof * v0);
    memcpy(u1, y, len * sizeof * u1);
    memcpy(v1, x, len * sizeof * v1);
    v1[0] --;

    /*
     * Each input operand may be as large as 31*len bits, and we
     * reduce the total length by at least 30 bits at each iteration.
     */
    for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) {
        uint32_t c0, c1;
        uint32_t a0, a1, b0, b1;
        uint64_t a_hi, b_hi;
        uint32_t a_lo, b_lo;
        int64_t pa, pb, qa, qb;
        int i;
        uint32_t r;

        /*
         * Extract the top words of a and b. If j is the highest
         * index >= 1 such that a[j] != 0 or b[j] != 0, then we
         * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1].
         * If a and b are down to one word each, then we use
         * a[0] and b[0].
         */
        c0 = (uint32_t) -1;
        c1 = (uint32_t) -1;
        a0 = 0;
        a1 = 0;
        b0 = 0;
        b1 = 0;
        j = len;
        while (j -- > 0) {
            uint32_t aw, bw;

            aw = a[j];
            bw = b[j];
            a0 ^= (a0 ^ aw) & c0;
            a1 ^= (a1 ^ aw) & c1;
            b0 ^= (b0 ^ bw) & c0;
            b1 ^= (b1 ^ bw) & c1;
            c1 = c0;
            c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
        }

        /*
         * If c1 = 0, then we grabbed two words for a and b.
         * If c1 != 0 but c0 = 0, then we grabbed one word. It
         * is not possible that c1 != 0 and c0 != 0, because that
         * would mean that both integers are zero.
         */
        a1 |= a0 & c1;
        a0 &= ~c1;
        b1 |= b0 & c1;
        b0 &= ~c1;
        a_hi = ((uint64_t)a0 << 31) + a1;
        b_hi = ((uint64_t)b0 << 31) + b1;
        a_lo = a[0];
        b_lo = b[0];

        /*
         * Compute reduction factors:
         *
         *   a' = a*pa + b*pb
         *   b' = a*qa + b*qb
         *
         * such that a' and b' are both multiple of 2^31, but are
         * only marginally larger than a and b.
         */
        pa = 1;
        pb = 0;
        qa = 0;
        qb = 1;
        for (i = 0; i < 31; i ++) {
            /*
             * At each iteration:
             *
             *   a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
             *   b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
             *   a <- a/2 if: a is even
             *   b <- b/2 if: a is odd, b is even
             *
             * We multiply a_lo and b_lo by 2 at each
             * iteration, thus a division by 2 really is a
             * non-multiplication by 2.
             */
            uint32_t rt, oa, ob, cAB, cBA, cA;
            uint64_t rz;

            /*
             * rt = 1 if a_hi > b_hi, 0 otherwise.
             */
            rz = b_hi - a_hi;
            rt = (uint32_t)((rz ^ ((a_hi ^ b_hi)
                                   & (a_hi ^ rz))) >> 63);

            /*
             * cAB = 1 if b must be subtracted from a
             * cBA = 1 if a must be subtracted from b
             * cA = 1 if a must be divided by 2
             *
             * Rules:
             *
             *   cAB and cBA cannot both be 1.
             *   If a is not divided by 2, b is.
             */
            oa = (a_lo >> i) & 1;
            ob = (b_lo >> i) & 1;
            cAB = oa & ob & rt;
            cBA = oa & ob & ~rt;
            cA = cAB | (oa ^ 1);

            /*
             * Conditional subtractions.
             */
            a_lo -= b_lo & -cAB;
            a_hi -= b_hi & -(uint64_t)cAB;
            pa -= qa & -(int64_t)cAB;
            pb -= qb & -(int64_t)cAB;
            b_lo -= a_lo & -cBA;
            b_hi -= a_hi & -(uint64_t)cBA;
            qa -= pa & -(int64_t)cBA;
            qb -= pb & -(int64_t)cBA;

            /*
             * Shifting.
             */
            a_lo += a_lo & (cA - 1);
            pa += pa & ((int64_t)cA - 1);
            pb += pb & ((int64_t)cA - 1);
            a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
            b_lo += b_lo & -cA;
            qa += qa & -(int64_t)cA;
            qb += qb & -(int64_t)cA;
            b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
        }

        /*
         * Apply the computed parameters to our values. We
         * may have to correct pa and pb depending on the
         * returned value of zint_co_reduce() (when a and/or b
         * had to be negated).
         */
        r = zint_co_reduce(a, b, len, pa, pb, qa, qb);
        pa -= (pa + pa) & -(int64_t)(r & 1);
        pb -= (pb + pb) & -(int64_t)(r & 1);
        qa -= (qa + qa) & -(int64_t)(r >> 1);
        qb -= (qb + qb) & -(int64_t)(r >> 1);
        zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb);
        zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb);
    }

    /*
     * At that point, array a[] should contain the GCD, and the
     * results (u,v) should already be set. We check that the GCD
     * is indeed 1. We also check that the two operands x and y
     * are odd.
     */
    rc = a[0] ^ 1;
    for (j = 1; j < len; j ++) {
        rc |= a[j];
    }
    return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]);
}

/*
 * Add k*y*2^sc to x. The result is assumed to fit in the array of
 * size xlen (truncation is applied if necessary).
 * Scale factor 'sc' is provided as sch and scl, such that:
 *   sch = sc / 31
 *   scl = sc % 31
 * xlen MUST NOT be lower than ylen.
 *
 * x[] and y[] are both signed integers, using two's complement for
 * negative values.
 */
static void
zint_add_scaled_mul_small(uint32_t *restrict x, size_t xlen,
                          const uint32_t *restrict y, size_t ylen, int32_t k,
                          uint32_t sch, uint32_t scl) {
    size_t u;
    uint32_t ysign, tw;
    int32_t cc;

    if (ylen == 0) {
        return;
    }

    ysign = -(y[ylen - 1] >> 30) >> 1;
    tw = 0;
    cc = 0;
    for (u = sch; u < xlen; u ++) {
        size_t v;
        uint32_t wy, wys, ccu;
        uint64_t z;

        /*
         * Get the next word of y (scaled).
         */
        v = u - sch;
        wy = v < ylen ? y[v] : ysign;
        wys = ((wy << scl) & 0x7FFFFFFF) | tw;
        tw = wy >> (31 - scl);

        /*
         * The expression below does not overflow.
         */
        z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc);
        x[u] = (uint32_t)z & 0x7FFFFFFF;

        /*
         * Right-shifting the signed value z would yield
         * implementation-defined results (arithmetic shift is
         * not guaranteed). However, we can cast to unsigned,
         * and get the next carry as an unsigned word. We can
         * then convert it back to signed by using the guaranteed
         * fact that 'int32_t' uses two's complement with no
         * trap representation or padding bit, and with a layout
         * compatible with that of 'uint32_t'.
         */
        ccu = (uint32_t)(z >> 31);
        cc = *(int32_t *)&ccu;
    }
}

/*
 * Subtract y*2^sc from x. The result is assumed to fit in the array of
 * size xlen (truncation is applied if necessary).
 * Scale factor 'sc' is provided as sch and scl, such that:
 *   sch = sc / 31
 *   scl = sc % 31
 * xlen MUST NOT be lower than ylen.
 *
 * x[] and y[] are both signed integers, using two's complement for
 * negative values.
 */
static void
zint_sub_scaled(uint32_t *restrict x, size_t xlen,
                const uint32_t *restrict y, size_t ylen, uint32_t sch, uint32_t scl) {
    size_t u;
    uint32_t ysign, tw;
    uint32_t cc;

    if (ylen == 0) {
        return;
    }

    ysign = -(y[ylen - 1] >> 30) >> 1;
    tw = 0;
    cc = 0;
    for (u = sch; u < xlen; u ++) {
        size_t v;
        uint32_t w, wy, wys;

        /*
         * Get the next word of y (scaled).
         */
        v = u - sch;
        wy = v < ylen ? y[v] : ysign;
        wys = ((wy << scl) & 0x7FFFFFFF) | tw;
        tw = wy >> (31 - scl);

        w = x[u] - wys - cc;
        x[u] = w & 0x7FFFFFFF;
        cc = w >> 31;
    }
}

/*
 * Convert a one-word signed big integer into a signed value.
 */
static inline int32_t
zint_one_to_plain(const uint32_t *x) {
    uint32_t w;

    w = x[0];
    w |= (w & 0x40000000) << 1;
    return *(int32_t *)&w;
}

/* ==================================================================== */

/*
 * Convert a polynomial to floating-point values.
 *
 * Each coefficient has length flen words, and starts fstride words after
 * the previous.
 *
 * IEEE-754 binary64 values can represent values in a finite range,
 * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large,
 * they should be "trimmed" by pointing not to the lowest word of each,
 * but upper.
 */
static void
poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride,
               unsigned logn) {
    size_t n, u;

    n = MKN(logn);
    if (flen == 0) {
        for (u = 0; u < n; u ++) {
            d[u] = fpr_zero;
        }
        return;
    }
    for (u = 0; u < n; u ++, f += fstride) {
        size_t v;
        uint32_t neg, cc, xm;
        fpr x, fsc;

        /*
         * Get sign of the integer; if it is negative, then we
         * will load its absolute value instead, and negate the
         * result.
         */
        neg = -(f[flen - 1] >> 30);
        xm = neg >> 1;
        cc = neg & 1;
        x = fpr_zero;
        fsc = fpr_one;
        for (v = 0; v < flen; v++, fsc = fpr_mul(fsc, fpr_ptwo31)) {
            uint32_t w;

            w = (f[v] ^ xm) + cc;
            cc = w >> 31;
            w &= 0x7FFFFFFF;
            w -= (w << 1) & neg;
            x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc));
        }
        d[u] = x;
    }
}

/*
 * Convert a polynomial to small integers. Source values are supposed
 * to be one-word integers, signed over 31 bits. Returned value is 0
 * if any of the coefficients exceeds the provided limit (in absolute
 * value), or 1 on success.
 *
 * This is not constant-time; this is not a problem here, because on
 * any failure, the NTRU-solving process will be deemed to have failed
 * and the (f,g) polynomials will be discarded.
 */
static int
poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) {
    size_t n, u;

    n = MKN(logn);
    for (u = 0; u < n; u ++) {
        int32_t z;

        z = zint_one_to_plain(s + u);
        if (z < -lim || z > lim) {
            return 0;
        }
        d[u] = (int8_t)z;
    }
    return 1;
}

/*
 * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1.
 * Coefficients of polynomial k are small integers (signed values in the
 * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31
 * and scl = sc % 31.
 *
 * This function implements the basic quadratic multiplication algorithm,
 * which is efficient in space (no extra buffer needed) but slow at
 * high degree.
 */
static void
poly_sub_scaled(uint32_t *restrict F, size_t Flen, size_t Fstride,
                const uint32_t *restrict f, size_t flen, size_t fstride,
                const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn) {
    size_t n, u;

    n = MKN(logn);
    for (u = 0; u < n; u ++) {
        int32_t kf;
        size_t v;
        uint32_t *x;
        const uint32_t *y;

        kf = -k[u];
        x = F + u * Fstride;
        y = f;
        for (v = 0; v < n; v ++) {
            zint_add_scaled_mul_small(
                x, Flen, y, flen, kf, sch, scl);
            if (u + v == n - 1) {
                x = F;
                kf = -kf;
            } else {
                x += Fstride;
            }
            y += fstride;
        }
    }
}

/*
 * Subtract k*f from F. Coefficients of polynomial k are small integers
 * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function
 * assumes that the degree is large, and integers relatively small.
 * The value sc is provided as sch = sc / 31 and scl = sc % 31.
 */
static void
poly_sub_scaled_ntt(uint32_t *restrict F, size_t Flen, size_t Fstride,
                    const uint32_t *restrict f, size_t flen, size_t fstride,
                    const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn,
                    uint32_t *restrict tmp) {
    uint32_t *gm, *igm, *fk, *t1, *x;
    const uint32_t *y;
    size_t n, u, tlen;
    const small_prime *primes;

    n = MKN(logn);
    tlen = flen + 1;
    gm = tmp;
    igm = gm + MKN(logn);
    fk = igm + MKN(logn);
    t1 = fk + n * tlen;

    primes = PRIMES;

    /*
     * Compute k*f in fk[], in RNS notation.
     */
    for (u = 0; u < tlen; u ++) {
        uint32_t p, p0i, R2, Rx;
        size_t v;

        p = primes[u].p;
        p0i = modp_ninv31(p);
        R2 = modp_R2(p, p0i);
        Rx = modp_Rx((unsigned)flen, p, p0i, R2);
        modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);

        for (v = 0; v < n; v ++) {
            t1[v] = modp_set(k[v], p);
        }
        modp_NTT2(t1, gm, logn, p, p0i);
        for (v = 0, y = f, x = fk + u;
                v < n; v ++, y += fstride, x += tlen) {
            *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx);
        }
        modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i);
        for (v = 0, x = fk + u; v < n; v ++, x += tlen) {
            *x = modp_montymul(
                     modp_montymul(t1[v], *x, p, p0i), R2, p, p0i);
        }
        modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i);
    }

    /*
     * Rebuild k*f.
     */
    zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1);

    /*
     * Subtract k*f, scaled, from F.
     */
    for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) {
        zint_sub_scaled(x, Flen, y, tlen, sch, scl);
    }
}

/* ==================================================================== */

#define RNG_CONTEXT   inner_shake256_context

/*
 * Get a random 8-byte integer from a SHAKE-based RNG. This function
 * ensures consistent interpretation of the SHAKE output so that
 * the same values will be obtained over different platforms, in case
 * a known seed is used.
 */
static inline uint64_t
get_rng_u64(inner_shake256_context *rng) {
    /*
     * We enforce little-endian representation.
     */

    uint8_t tmp[8];

    inner_shake256_extract(rng, tmp, sizeof tmp);
    return (uint64_t)tmp[0]
           | ((uint64_t)tmp[1] << 8)
           | ((uint64_t)tmp[2] << 16)
           | ((uint64_t)tmp[3] << 24)
           | ((uint64_t)tmp[4] << 32)
           | ((uint64_t)tmp[5] << 40)
           | ((uint64_t)tmp[6] << 48)
           | ((uint64_t)tmp[7] << 56);
}

/*
 * Table below incarnates a discrete Gaussian distribution:
 *    D(x) = exp(-(x^2)/(2*sigma^2))
 * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024.
 * Element 0 of the table is P(x = 0).
 * For k > 0, element k is P(x >= k+1 | x > 0).
 * Probabilities are scaled up by 2^63.
 */
static const uint64_t gauss_1024_12289[] = {
    1283868770400643928u,  6416574995475331444u,  4078260278032692663u,
    2353523259288686585u,  1227179971273316331u,   575931623374121527u,
    242543240509105209u,    91437049221049666u,    30799446349977173u,
    9255276791179340u,     2478152334826140u,      590642893610164u,
    125206034929641u,       23590435911403u,        3948334035941u,
    586753615614u,          77391054539u,           9056793210u,
    940121950u,             86539696u,              7062824u,
    510971u,                32764u,                 1862u,
    94u,                    4u,                    0u
};

/*
 * Generate a random value with a Gaussian distribution centered on 0.
 * The RNG must be ready for extraction (already flipped).
 *
 * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The
 * precomputed table is for N = 1024. Since the sum of two independent
 * values of standard deviation sigma has standard deviation
 * sigma*sqrt(2), then we can just generate more values and add them
 * together for lower dimensions.
 */
static int
mkgauss(RNG_CONTEXT *rng, unsigned logn) {
    unsigned u, g;
    int val;

    g = 1U << (10 - logn);
    val = 0;
    for (u = 0; u < g; u ++) {
        /*
         * Each iteration generates one value with the
         * Gaussian distribution for N = 1024.
         *
         * We use two random 64-bit values. First value
         * decides on whether the generated value is 0, and,
         * if not, the sign of the value. Second random 64-bit
         * word is used to generate the non-zero value.
         *
         * For constant-time code we have to read the complete
         * table. This has negligible cost, compared with the
         * remainder of the keygen process (solving the NTRU
         * equation).
         */
        uint64_t r;
        uint32_t f, v, k, neg;

        /*
         * First value:
         *  - flag 'neg' is randomly selected to be 0 or 1.
         *  - flag 'f' is set to 1 if the generated value is zero,
         *    or set to 0 otherwise.
         */
        r = get_rng_u64(rng);
        neg = (uint32_t)(r >> 63);
        r &= ~((uint64_t)1 << 63);
        f = (uint32_t)((r - gauss_1024_12289[0]) >> 63);

        /*
         * We produce a new random 63-bit integer r, and go over
         * the array, starting at index 1. We store in v the
         * index of the first array element which is not greater
         * than r, unless the flag f was already 1.
         */
        v = 0;
        r = get_rng_u64(rng);
        r &= ~((uint64_t)1 << 63);
        for (k = 1; k < (sizeof gauss_1024_12289)
                / (sizeof gauss_1024_12289[0]); k ++) {
            uint32_t t;

            t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1;
            v |= k & -(t & (f ^ 1));
            f |= t;
        }

        /*
         * We apply the sign ('neg' flag). If the value is zero,
         * the sign has no effect.
         */
        v = (v ^ -neg) + neg;

        /*
         * Generated value is added to val.
         */
        val += *(int32_t *)&v;
    }
    return val;
}

/*
 * The MAX_BL_SMALL[] and MAX_BL_LARGE[] contain the lengths, in 31-bit
 * words, of intermediate values in the computation:
 *
 *   MAX_BL_SMALL[depth]: length for the input f and g at that depth
 *   MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth
 *
 * Rules:
 *
 *  - Within an array, values grow.
 *
 *  - The 'SMALL' array must have an entry for maximum depth, corresponding
 *    to the size of values used in the binary GCD. There is no such value
 *    for the 'LARGE' array (the binary GCD yields already reduced
 *    coefficients).
 *
 *  - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1].
 *
 *  - Values must be large enough to handle the common cases, with some
 *    margins.
 *
 *  - Values must not be "too large" either because we will convert some
 *    integers into floating-point values by considering the top 10 words,
 *    i.e. 310 bits; hence, for values of length more than 10 words, we
 *    should take care to have the length centered on the expected size.
 *
 * The following average lengths, in bits, have been measured on thousands
 * of random keys (fg = max length of the absolute value of coefficients
 * of f and g at that depth; FG = idem for the unreduced F and G; for the
 * maximum depth, F and G are the output of binary GCD, multiplied by q;
 * for each value, the average and standard deviation are provided).
 *
 * Binary case:
 *    depth: 10    fg: 6307.52 (24.48)    FG: 6319.66 (24.51)
 *    depth:  9    fg: 3138.35 (12.25)    FG: 9403.29 (27.55)
 *    depth:  8    fg: 1576.87 ( 7.49)    FG: 4703.30 (14.77)
 *    depth:  7    fg:  794.17 ( 4.98)    FG: 2361.84 ( 9.31)
 *    depth:  6    fg:  400.67 ( 3.10)    FG: 1188.68 ( 6.04)
 *    depth:  5    fg:  202.22 ( 1.87)    FG:  599.81 ( 3.87)
 *    depth:  4    fg:  101.62 ( 1.02)    FG:  303.49 ( 2.38)
 *    depth:  3    fg:   50.37 ( 0.53)    FG:  153.65 ( 1.39)
 *    depth:  2    fg:   24.07 ( 0.25)    FG:   78.20 ( 0.73)
 *    depth:  1    fg:   10.99 ( 0.08)    FG:   39.82 ( 0.41)
 *    depth:  0    fg:    4.00 ( 0.00)    FG:   19.61 ( 0.49)
 *
 * Integers are actually represented either in binary notation over
 * 31-bit words (signed, using two's complement), or in RNS, modulo
 * many small primes. These small primes are close to, but slightly
 * lower than, 2^31. Use of RNS loses less than two bits, even for
 * the largest values.
 *
 * IMPORTANT: if these values are modified, then the temporary buffer
 * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed
 * accordingly.
 */

static const size_t MAX_BL_SMALL[] = {
    1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209
};

static const size_t MAX_BL_LARGE[] = {
    2, 2, 5, 7, 12, 21, 40, 78, 157, 308
};

/*
 * Average and standard deviation for the maximum size (in bits) of
 * coefficients of (f,g), depending on depth. These values are used
 * to compute bounds for Babai's reduction.
 */
static const struct {
    int avg;
    int std;
} BITLENGTH[] = {
    {    4,  0 },
    {   11,  1 },
    {   24,  1 },
    {   50,  1 },
    {  102,  1 },
    {  202,  2 },
    {  401,  4 },
    {  794,  5 },
    { 1577,  8 },
    { 3138, 13 },
    { 6308, 25 }
};

/*
 * Minimal recursion depth at which we rebuild intermediate values
 * when reconstructing f and g.
 */
#define DEPTH_INT_FG   4

/*
 * Compute squared norm of a short vector. Returned value is saturated to
 * 2^32-1 if it is not lower than 2^31.
 */
static uint32_t
poly_small_sqnorm(const int8_t *f, unsigned logn) {
    size_t n, u;
    uint32_t s, ng;

    n = MKN(logn);
    s = 0;
    ng = 0;
    for (u = 0; u < n; u ++) {
        int32_t z;

        z = f[u];
        s += (uint32_t)(z * z);
        ng |= s;
    }
    return s | -(ng >> 31);
}

/*
 * Align (upwards) the provided 'data' pointer with regards to 'base'
 * so that the offset is a multiple of the size of 'fpr'.
 */
static fpr *
align_fpr(void *base, void *data) {
    uint8_t *cb, *cd;
    size_t k, km;

    cb = base;
    cd = data;
    k = (size_t)(cd - cb);
    km = k % sizeof(fpr);
    if (km) {
        k += (sizeof(fpr)) - km;
    }
    return (fpr *)(cb + k);
}

/*
 * Align (upwards) the provided 'data' pointer with regards to 'base'
 * so that the offset is a multiple of the size of 'uint32_t'.
 */
static uint32_t *
align_u32(void *base, void *data) {
    uint8_t *cb, *cd;
    size_t k, km;

    cb = base;
    cd = data;
    k = (size_t)(cd - cb);
    km = k % sizeof(uint32_t);
    if (km) {
        k += (sizeof(uint32_t)) - km;
    }
    return (uint32_t *)(cb + k);
}

/*
 * Input: f,g of degree N = 2^logn; 'depth' is used only to get their
 * individual length.
 *
 * Output: f',g' of degree N/2, with the length for 'depth+1'.
 *
 * Values are in RNS; input and/or output may also be in NTT.
 */
static void
make_fg_step(uint32_t *data, unsigned logn, unsigned depth,
             int in_ntt, int out_ntt) {
    size_t n, hn, u;
    size_t slen, tlen;
    uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1;
    const small_prime *primes;

    n = (size_t)1 << logn;
    hn = n >> 1;
    slen = MAX_BL_SMALL[depth];
    tlen = MAX_BL_SMALL[depth + 1];
    primes = PRIMES;

    /*
     * Prepare room for the result.
     */
    fd = data;
    gd = fd + hn * tlen;
    fs = gd + hn * tlen;
    gs = fs + n * slen;
    gm = gs + n * slen;
    igm = gm + n;
    t1 = igm + n;
    memmove(fs, data, 2 * n * slen * sizeof * data);

    /*
     * First slen words: we use the input values directly, and apply
     * inverse NTT as we go.
     */
    for (u = 0; u < slen; u ++) {
        uint32_t p, p0i, R2;
        size_t v;
        uint32_t *x;

        p = primes[u].p;
        p0i = modp_ninv31(p);
        R2 = modp_R2(p, p0i);
        modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);

        for (v = 0, x = fs + u; v < n; v ++, x += slen) {
            t1[v] = *x;
        }
        if (!in_ntt) {
            modp_NTT2(t1, gm, logn, p, p0i);
        }
        for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
            uint32_t w0, w1;

            w0 = t1[(v << 1) + 0];
            w1 = t1[(v << 1) + 1];
            *x = modp_montymul(
                     modp_montymul(w0, w1, p, p0i), R2, p, p0i);
        }
        if (in_ntt) {
            modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i);
        }

        for (v = 0, x = gs + u; v < n; v ++, x += slen) {
            t1[v] = *x;
        }
        if (!in_ntt) {
            modp_NTT2(t1, gm, logn, p, p0i);
        }
        for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
            uint32_t w0, w1;

            w0 = t1[(v << 1) + 0];
            w1 = t1[(v << 1) + 1];
            *x = modp_montymul(
                     modp_montymul(w0, w1, p, p0i), R2, p, p0i);
        }
        if (in_ntt) {
            modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i);
        }

        if (!out_ntt) {
            modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
            modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
        }
    }

    /*
     * Since the fs and gs words have been de-NTTized, we can use the
     * CRT to rebuild the values.
     */
    zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm);
    zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm);

    /*
     * Remaining words: use modular reductions to extract the values.
     */
    for (u = slen; u < tlen; u ++) {
        uint32_t p, p0i, R2, Rx;
        size_t v;
        uint32_t *x;

        p = primes[u].p;
        p0i = modp_ninv31(p);
        R2 = modp_R2(p, p0i);
        Rx = modp_Rx((unsigned)slen, p, p0i, R2);
        modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
        for (v = 0, x = fs; v < n; v ++, x += slen) {
            t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
        }
        modp_NTT2(t1, gm, logn, p, p0i);
        for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
            uint32_t w0, w1;

            w0 = t1[(v << 1) + 0];
            w1 = t1[(v << 1) + 1];
            *x = modp_montymul(
                     modp_montymul(w0, w1, p, p0i), R2, p, p0i);
        }
        for (v = 0, x = gs; v < n; v ++, x += slen) {
            t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
        }
        modp_NTT2(t1, gm, logn, p, p0i);
        for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
            uint32_t w0, w1;

            w0 = t1[(v << 1) + 0];
            w1 = t1[(v << 1) + 1];
            *x = modp_montymul(
                     modp_montymul(w0, w1, p, p0i), R2, p, p0i);
        }

        if (!out_ntt) {
            modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
            modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
        }
    }
}

/*
 * Compute f and g at a specific depth, in RNS notation.
 *
 * Returned values are stored in the data[] array, at slen words per integer.
 *
 * Conditions:
 *   0 <= depth <= logn
 *
 * Space use in data[]: enough room for any two successive values (f', g',
 * f and g).
 */
static void
make_fg(uint32_t *data, const int8_t *f, const int8_t *g,
        unsigned logn, unsigned depth, int out_ntt) {
    size_t n, u;
    uint32_t *ft, *gt, p0;
    unsigned d;
    const small_prime *primes;

    n = MKN(logn);
    ft = data;
    gt = ft + n;
    primes = PRIMES;
    p0 = primes[0].p;
    for (u = 0; u < n; u ++) {
        ft[u] = modp_set(f[u], p0);
        gt[u] = modp_set(g[u], p0);
    }

    if (depth == 0 && out_ntt) {
        uint32_t *gm, *igm;
        uint32_t p, p0i;

        p = primes[0].p;
        p0i = modp_ninv31(p);
        gm = gt + n;
        igm = gm + MKN(logn);
        modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i);
        modp_NTT2(ft, gm, logn, p, p0i);
        modp_NTT2(gt, gm, logn, p, p0i);
        return;
    }

    for (d = 0; d < depth; d ++) {
        make_fg_step(data, logn - d, d,
                     d != 0, (d + 1) < depth || out_ntt);
    }
}

/*
 * Solving the NTRU equation, deepest level: compute the resultants of
 * f and g with X^N+1, and use binary GCD. The F and G values are
 * returned in tmp[].
 *
 * Returned value: 1 on success, 0 on error.
 */
static int
solve_NTRU_deepest(unsigned logn_top,
                   const int8_t *f, const int8_t *g, uint32_t *tmp) {
    size_t len;
    uint32_t *Fp, *Gp, *fp, *gp, *t1, q;
    const small_prime *primes;

    len = MAX_BL_SMALL[logn_top];
    primes = PRIMES;

    Fp = tmp;
    Gp = Fp + len;
    fp = Gp + len;
    gp = fp + len;
    t1 = gp + len;

    make_fg(fp, f, g, logn_top, logn_top, 0);

    /*
     * We use the CRT to rebuild the resultants as big integers.
     * There are two such big integers. The resultants are always
     * nonnegative.
     */
    zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1);

    /*
     * Apply the binary GCD. The zint_bezout() function works only
     * if both inputs are odd.
     *
     * We can test on the result and return 0 because that would
     * imply failure of the NTRU solving equation, and the (f,g)
     * values will be abandoned in that case.
     */
    if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) {
        return 0;
    }

    /*
     * Multiply the two values by the target value q. Values must
     * fit in the destination arrays.
     * We can again test on the returned words: a non-zero output
     * of zint_mul_small() means that we exceeded our array
     * capacity, and that implies failure and rejection of (f,g).
     */
    q = 12289;
    if (zint_mul_small(Fp, len, q) != 0
            || zint_mul_small(Gp, len, q) != 0) {
        return 0;
    }

    return 1;
}

/*
 * Solving the NTRU equation, intermediate level. Upon entry, the F and G
 * from the previous level should be in the tmp[] array.
 * This function MAY be invoked for the top-level (in which case depth = 0).
 *
 * Returned value: 1 on success, 0 on error.
 */
static int
solve_NTRU_intermediate(unsigned logn_top,
                        const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) {
    /*
     * In this function, 'logn' is the log2 of the degree for
     * this step. If N = 2^logn, then:
     *  - the F and G values already in fk->tmp (from the deeper
     *    levels) have degree N/2;
     *  - this function should return F and G of degree N.
     */
    unsigned logn;
    size_t n, hn, slen, dlen, llen, rlen, FGlen, u;
    uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
    fpr *rt1, *rt2, *rt3, *rt4, *rt5;
    int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k;
    uint32_t *x, *y;
    int32_t *k;
    const small_prime *primes;

    logn = logn_top - depth;
    n = (size_t)1 << logn;
    hn = n >> 1;

    /*
     * slen = size for our input f and g; also size of the reduced
     *        F and G we return (degree N)
     *
     * dlen = size of the F and G obtained from the deeper level
     *        (degree N/2 or N/3)
     *
     * llen = size for intermediary F and G before reduction (degree N)
     *
     * We build our non-reduced F and G as two independent halves each,
     * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
     */
    slen = MAX_BL_SMALL[depth];
    dlen = MAX_BL_SMALL[depth + 1];
    llen = MAX_BL_LARGE[depth];
    primes = PRIMES;

    /*
     * Fd and Gd are the F and G from the deeper level.
     */
    Fd = tmp;
    Gd = Fd + dlen * hn;

    /*
     * Compute the input f and g for this level. Note that we get f
     * and g in RNS + NTT representation.
     */
    ft = Gd + dlen * hn;
    make_fg(ft, f, g, logn_top, depth, 1);

    /*
     * Move the newly computed f and g to make room for our candidate
     * F and G (unreduced).
     */
    Ft = tmp;
    Gt = Ft + n * llen;
    t1 = Gt + n * llen;
    memmove(t1, ft, 2 * n * slen * sizeof * ft);
    ft = t1;
    gt = ft + slen * n;
    t1 = gt + slen * n;

    /*
     * Move Fd and Gd _after_ f and g.
     */
    memmove(t1, Fd, 2 * hn * dlen * sizeof * Fd);
    Fd = t1;
    Gd = Fd + hn * dlen;

    /*
     * We reduce Fd and Gd modulo all the small primes we will need,
     * and store the values in Ft and Gt (only n/2 values in each).
     */
    for (u = 0; u < llen; u ++) {
        uint32_t p, p0i, R2, Rx;
        size_t v;
        uint32_t *xs, *ys, *xd, *yd;

        p = primes[u].p;
        p0i = modp_ninv31(p);
        R2 = modp_R2(p, p0i);
        Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
        for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
                v < hn;
                v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
            *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
            *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
        }
    }

    /*
     * We do not need Fd and Gd after that point.
     */

    /*
     * Compute our F and G modulo sufficiently many small primes.
     */
    for (u = 0; u < llen; u ++) {
        uint32_t p, p0i, R2;
        uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
        size_t v;

        /*
         * All computations are done modulo p.
         */
        p = primes[u].p;
        p0i = modp_ninv31(p);
        R2 = modp_R2(p, p0i);

        /*
         * If we processed slen words, then f and g have been
         * de-NTTized, and are in RNS; we can rebuild them.
         */
        if (u == slen) {
            zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1);
            zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1);
        }

        gm = t1;
        igm = gm + n;
        fx = igm + n;
        gx = fx + n;

        modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);

        if (u < slen) {
            for (v = 0, x = ft + u, y = gt + u;
                    v < n; v ++, x += slen, y += slen) {
                fx[v] = *x;
                gx[v] = *y;
            }
            modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i);
            modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i);
        } else {
            uint32_t Rx;

            Rx = modp_Rx((unsigned)slen, p, p0i, R2);
            for (v = 0, x = ft, y = gt;
                    v < n; v ++, x += slen, y += slen) {
                fx[v] = zint_mod_small_signed(x, slen,
                                              p, p0i, R2, Rx);
                gx[v] = zint_mod_small_signed(y, slen,
                                              p, p0i, R2, Rx);
            }
            modp_NTT2(fx, gm, logn, p, p0i);
            modp_NTT2(gx, gm, logn, p, p0i);
        }

        /*
         * Get F' and G' modulo p and in NTT representation
         * (they have degree n/2). These values were computed in
         * a previous step, and stored in Ft and Gt.
         */
        Fp = gx + n;
        Gp = Fp + hn;
        for (v = 0, x = Ft + u, y = Gt + u;
                v < hn; v ++, x += llen, y += llen) {
            Fp[v] = *x;
            Gp[v] = *y;
        }
        modp_NTT2(Fp, gm, logn - 1, p, p0i);
        modp_NTT2(Gp, gm, logn - 1, p, p0i);

        /*
         * Compute our F and G modulo p.
         *
         * General case:
         *
         *   we divide degree by d = 2 or 3
         *   f'(x^d) = N(f)(x^d) = f * adj(f)
         *   g'(x^d) = N(g)(x^d) = g * adj(g)
         *   f'*G' - g'*F' = q
         *   F = F'(x^d) * adj(g)
         *   G = G'(x^d) * adj(f)
         *
         * We compute things in the NTT. We group roots of phi
         * such that all roots x in a group share the same x^d.
         * If the roots in a group are x_1, x_2... x_d, then:
         *
         *   N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d)
         *
         * Thus, we have:
         *
         *   G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d)
         *   G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d)
         *   ...
         *   G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d)
         *
         * In all cases, we can thus compute F and G in NTT
         * representation by a few simple multiplications.
         * Moreover, in our chosen NTT representation, roots
         * from the same group are consecutive in RAM.
         */
        for (v = 0, x = Ft + u, y = Gt + u; v < hn;
                v ++, x += (llen << 1), y += (llen << 1)) {
            uint32_t ftA, ftB, gtA, gtB;
            uint32_t mFp, mGp;

            ftA = fx[(v << 1) + 0];
            ftB = fx[(v << 1) + 1];
            gtA = gx[(v << 1) + 0];
            gtB = gx[(v << 1) + 1];
            mFp = modp_montymul(Fp[v], R2, p, p0i);
            mGp = modp_montymul(Gp[v], R2, p, p0i);
            x[0] = modp_montymul(gtB, mFp, p, p0i);
            x[llen] = modp_montymul(gtA, mFp, p, p0i);
            y[0] = modp_montymul(ftB, mGp, p, p0i);
            y[llen] = modp_montymul(ftA, mGp, p, p0i);
        }
        modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
        modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
    }

    /*
     * Rebuild F and G with the CRT.
     */
    zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1);
    zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1);

    /*
     * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that
     * order).
     */

    /*
     * Apply Babai reduction to bring back F and G to size slen.
     *
     * We use the FFT to compute successive approximations of the
     * reduction coefficient. We first isolate the top bits of
     * the coefficients of f and g, and convert them to floating
     * point; with the FFT, we compute adj(f), adj(g), and
     * 1/(f*adj(f)+g*adj(g)).
     *
     * Then, we repeatedly apply the following:
     *
     *   - Get the top bits of the coefficients of F and G into
     *     floating point, and use the FFT to compute:
     *        (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g))
     *
     *   - Convert back that value into normal representation, and
     *     round it to the nearest integers, yielding a polynomial k.
     *     Proper scaling is applied to f, g, F and G so that the
     *     coefficients fit on 32 bits (signed).
     *
     *   - Subtract k*f from F and k*g from G.
     *
     * Under normal conditions, this process reduces the size of F
     * and G by some bits at each iteration. For constant-time
     * operation, we do not want to measure the actual length of
     * F and G; instead, we do the following:
     *
     *   - f and g are converted to floating-point, with some scaling
     *     if necessary to keep values in the representable range.
     *
     *   - For each iteration, we _assume_ a maximum size for F and G,
     *     and use the values at that size. If we overreach, then
     *     we get zeros, which is harmless: the resulting coefficients
     *     of k will be 0 and the value won't be reduced.
     *
     *   - We conservatively assume that F and G will be reduced by
     *     at least 25 bits at each iteration.
     *
     * Even when reaching the bottom of the reduction, reduction
     * coefficient will remain low. If it goes out-of-range, then
     * something wrong occurred and the whole NTRU solving fails.
     */

    /*
     * Memory layout:
     *  - We need to compute and keep adj(f), adj(g), and
     *    1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers,
     *    respectively).
     *  - At each iteration we need two extra fp buffer (N fp values),
     *    and produce a k (N 32-bit words). k will be shared with one
     *    of the fp buffers.
     *  - To compute k*f and k*g efficiently (with the NTT), we need
     *    some extra room; we reuse the space of the temporary buffers.
     *
     * Arrays of 'fpr' are obtained from the temporary array itself.
     * We ensure that the base is at a properly aligned offset (the
     * source array tmp[] is supposed to be already aligned).
     */

    rt3 = align_fpr(tmp, t1);
    rt4 = rt3 + n;
    rt5 = rt4 + n;
    rt1 = rt5 + (n >> 1);
    k = (int32_t *)align_u32(tmp, rt1);
    rt2 = align_fpr(tmp, k + n);
    if (rt2 < (rt1 + n)) {
        rt2 = rt1 + n;
    }
    t1 = (uint32_t *)k + n;

    /*
     * Get f and g into rt3 and rt4 as floating-point approximations.
     *
     * We need to "scale down" the floating-point representation of
     * coefficients when they are too big. We want to keep the value
     * below 2^310 or so. Thus, when values are larger than 10 words,
     * we consider only the top 10 words. Array lengths have been
     * computed so that average maximum length will fall in the
     * middle or the upper half of these top 10 words.
     */
    rlen = (slen > 10) ? 10 : slen;
    poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn);
    poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn);

    /*
     * Values in rt3 and rt4 are downscaled by 2^(scale_fg).
     */
    scale_fg = 31 * (int)(slen - rlen);

    /*
     * Estimated boundaries for the maximum size (in bits) of the
     * coefficients of (f,g). We use the measured average, and
     * allow for a deviation of at most six times the standard
     * deviation.
     */
    minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std;
    maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std;

    /*
     * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f)
     * and adj(g) in rt3 and rt4, respectively.
     */
    PQCLEAN_FALCON512_AARCH64_FFT(rt3, logn);
    PQCLEAN_FALCON512_AARCH64_poly_adj_fft(rt3, rt3, logn);
    PQCLEAN_FALCON512_AARCH64_FFT(rt4, logn);
    PQCLEAN_FALCON512_AARCH64_poly_adj_fft(rt4, rt4, logn);
    PQCLEAN_FALCON512_AARCH64_poly_invnorm2_fft(rt5, rt3, rt4, logn);

    /*
     * Reduce F and G repeatedly.
     *
     * The expected maximum bit length of coefficients of F and G
     * is kept in maxbl_FG, with the corresponding word length in
     * FGlen.
     */
    FGlen = llen;
    maxbl_FG = 31 * (int)llen;

    /*
     * Each reduction operation computes the reduction polynomial
     * "k". We need that polynomial to have coefficients that fit
     * on 32-bit signed integers, with some scaling; thus, we use
     * a descending sequence of scaling values, down to zero.
     *
     * The size of the coefficients of k is (roughly) the difference
     * between the size of the coefficients of (F,G) and the size
     * of the coefficients of (f,g). Thus, the maximum size of the
     * coefficients of k is, at the start, maxbl_FG - minbl_fg;
     * this is our starting scale value for k.
     *
     * We need to estimate the size of (F,G) during the execution of
     * the algorithm; we are allowed some overestimation but not too
     * much (poly_big_to_fp() uses a 310-bit window). Generally
     * speaking, after applying a reduction with k scaled to
     * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd,
     * where 'dd' is a few bits to account for the fact that the
     * reduction is never perfect (intuitively, dd is on the order
     * of sqrt(N), so at most 5 bits; we here allow for 10 extra
     * bits).
     *
     * The size of (f,g) is not known exactly, but maxbl_fg is an
     * upper bound.
     */
    scale_k = maxbl_FG - minbl_fg;

    for (;;) {
        int scale_FG, dc, new_maxbl_FG;
        uint32_t scl, sch;
        fpr pdc, pt;

        /*
         * Convert current F and G into floating-point. We apply
         * scaling if the current length is more than 10 words.
         */
        rlen = (FGlen > 10) ? 10 : FGlen;
        scale_FG = 31 * (int)(FGlen - rlen);
        poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn);
        poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn);

        /*
         * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2.
         */
        PQCLEAN_FALCON512_AARCH64_FFT(rt1, logn);
        PQCLEAN_FALCON512_AARCH64_poly_mul_fft(rt1, rt1, rt3, logn);
        PQCLEAN_FALCON512_AARCH64_FFT(rt2, logn);
        PQCLEAN_FALCON512_AARCH64_poly_mul_fft(rt2, rt2, rt4, logn);
        PQCLEAN_FALCON512_AARCH64_poly_add(rt2, rt2, rt1, logn);
        PQCLEAN_FALCON512_AARCH64_poly_mul_autoadj_fft(rt2, rt2, rt5, logn);
        PQCLEAN_FALCON512_AARCH64_iFFT(rt2, logn);

        /*
         * (f,g) are scaled by 'scale_fg', meaning that the
         * numbers in rt3/rt4 should be multiplied by 2^(scale_fg)
         * to have their true mathematical value.
         *
         * (F,G) are similarly scaled by 'scale_FG'. Therefore,
         * the value we computed in rt2 is scaled by
         * 'scale_FG-scale_fg'.
         *
         * We want that value to be scaled by 'scale_k', hence we
         * apply a corrective scaling. After scaling, the values
         * should fit in -2^31-1..+2^31-1.
         */
        dc = scale_k - scale_FG + scale_fg;

        /*
         * We will need to multiply values by 2^(-dc). The value
         * 'dc' is not secret, so we can compute 2^(-dc) with a
         * non-constant-time process.
         * (We could use ldexp(), but we prefer to avoid any
         * dependency on libm. When using FP emulation, we could
         * use our fpr_ldexp(), which is constant-time.)
         */
        if (dc < 0) {
            dc = -dc;
            pt = fpr_two;
        } else {
            pt = fpr_onehalf;
        }
        pdc = fpr_one;
        while (dc != 0) {
            if ((dc & 1) != 0) {
                pdc = fpr_mul(pdc, pt);
            }
            dc >>= 1;
            pt = fpr_sqr(pt);
        }

        for (u = 0; u < n; u ++) {
            fpr xv;

            xv = fpr_mul(rt2[u], pdc);

            /*
             * Sometimes the values can be out-of-bounds if
             * the algorithm fails; we must not call
             * fpr_rint() (and cast to int32_t) if the value
             * is not in-bounds. Note that the test does not
             * break constant-time discipline, since any
             * failure here implies that we discard the current
             * secret key (f,g).
             */
            if (!fpr_lt(fpr_mtwo31m1, xv)
                    || !fpr_lt(xv, fpr_ptwo31m1)) {
                return 0;
            }
            k[u] = (int32_t)fpr_rint(xv);
        }

        /*
         * Values in k[] are integers. They really are scaled
         * down by maxbl_FG - minbl_fg bits.
         *
         * If we are at low depth, then we use the NTT to
         * compute k*f and k*g.
         */
        sch = (uint32_t)(scale_k / 31);
        scl = (uint32_t)(scale_k % 31);
        if (depth <= DEPTH_INT_FG) {
            poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen,
                                k, sch, scl, logn, t1);
            poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen,
                                k, sch, scl, logn, t1);
        } else {
            poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen,
                            k, sch, scl, logn);
            poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen,
                            k, sch, scl, logn);
        }

        /*
         * We compute the new maximum size of (F,G), assuming that
         * (f,g) has _maximal_ length (i.e. that reduction is
         * "late" instead of "early". We also adjust FGlen
         * accordingly.
         */
        new_maxbl_FG = scale_k + maxbl_fg + 10;
        if (new_maxbl_FG < maxbl_FG) {
            maxbl_FG = new_maxbl_FG;
            if ((int)FGlen * 31 >= maxbl_FG + 31) {
                FGlen --;
            }
        }

        /*
         * We suppose that scaling down achieves a reduction by
         * at least 25 bits per iteration. We stop when we have
         * done the loop with an unscaled k.
         */
        if (scale_k <= 0) {
            break;
        }
        scale_k -= 25;
        if (scale_k < 0) {
            scale_k = 0;
        }
    }

    /*
     * If (F,G) length was lowered below 'slen', then we must take
     * care to re-extend the sign.
     */
    if (FGlen < slen) {
        for (u = 0; u < n; u ++, Ft += llen, Gt += llen) {
            size_t v;
            uint32_t sw;

            sw = -(Ft[FGlen - 1] >> 30) >> 1;
            for (v = FGlen; v < slen; v ++) {
                Ft[v] = sw;
            }
            sw = -(Gt[FGlen - 1] >> 30) >> 1;
            for (v = FGlen; v < slen; v ++) {
                Gt[v] = sw;
            }
        }
    }

    /*
     * Compress encoding of all values to 'slen' words (this is the
     * expected output format).
     */
    for (u = 0, x = tmp, y = tmp;
            u < (n << 1); u ++, x += slen, y += llen) {
        memmove(x, y, slen * sizeof * y);
    }
    return 1;
}

/*
 * Solving the NTRU equation, binary case, depth = 1. Upon entry, the
 * F and G from the previous level should be in the tmp[] array.
 *
 * Returned value: 1 on success, 0 on error.
 */
static int
solve_NTRU_binary_depth1(unsigned logn_top,
                         const int8_t *f, const int8_t *g, uint32_t *tmp) {
    /*
     * The first half of this function is a copy of the corresponding
     * part in solve_NTRU_intermediate(), for the reconstruction of
     * the unreduced F and G. The second half (Babai reduction) is
     * done differently, because the unreduced F and G fit in 53 bits
     * of precision, allowing a much simpler process with lower RAM
     * usage.
     */
    unsigned depth, logn;
    size_t n_top, n, hn, slen, dlen, llen, u;
    uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
    fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6;
    uint32_t *x, *y;

    depth = 1;
    n_top = (size_t)1 << logn_top;
    logn = logn_top - depth;
    n = (size_t)1 << logn;
    hn = n >> 1;

    /*
     * Equations are:
     *
     *   f' = f0^2 - X^2*f1^2
     *   g' = g0^2 - X^2*g1^2
     *   F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
     *   F = F'*(g0 - X*g1)
     *   G = G'*(f0 - X*f1)
     *
     * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
     * degree N/2 (their odd-indexed coefficients are all zero).
     */

    /*
     * slen = size for our input f and g; also size of the reduced
     *        F and G we return (degree N)
     *
     * dlen = size of the F and G obtained from the deeper level
     *        (degree N/2)
     *
     * llen = size for intermediary F and G before reduction (degree N)
     *
     * We build our non-reduced F and G as two independent halves each,
     * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
     */
    slen = MAX_BL_SMALL[depth];
    dlen = MAX_BL_SMALL[depth + 1];
    llen = MAX_BL_LARGE[depth];

    /*
     * Fd and Gd are the F and G from the deeper level. Ft and Gt
     * are the destination arrays for the unreduced F and G.
     */
    Fd = tmp;
    Gd = Fd + dlen * hn;
    Ft = Gd + dlen * hn;
    Gt = Ft + llen * n;

    /*
     * We reduce Fd and Gd modulo all the small primes we will need,
     * and store the values in Ft and Gt.
     */
    for (u = 0; u < llen; u ++) {
        uint32_t p, p0i, R2, Rx;
        size_t v;
        uint32_t *xs, *ys, *xd, *yd;

        p = PRIMES[u].p;
        p0i = modp_ninv31(p);
        R2 = modp_R2(p, p0i);
        Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
        for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
                v < hn;
                v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
            *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
            *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
        }
    }

    /*
     * Now Fd and Gd are not needed anymore; we can squeeze them out.
     */
    memmove(tmp, Ft, llen * n * sizeof(uint32_t));
    Ft = tmp;
    memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t));
    Gt = Ft + llen * n;
    ft = Gt + llen * n;
    gt = ft + slen * n;

    t1 = gt + slen * n;

    /*
     * Compute our F and G modulo sufficiently many small primes.
     */
    for (u = 0; u < llen; u ++) {
        uint32_t p, p0i, R2;
        uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
        unsigned e;
        size_t v;

        /*
         * All computations are done modulo p.
         */
        p = PRIMES[u].p;
        p0i = modp_ninv31(p);
        R2 = modp_R2(p, p0i);

        /*
         * We recompute things from the source f and g, of full
         * degree. However, we will need only the n first elements
         * of the inverse NTT table (igm); the call to modp_mkgm()
         * below will fill n_top elements in igm[] (thus overflowing
         * into fx[]) but later code will overwrite these extra
         * elements.
         */
        gm = t1;
        igm = gm + n_top;
        fx = igm + n;
        gx = fx + n_top;
        modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i);

        /*
         * Set ft and gt to f and g modulo p, respectively.
         */
        for (v = 0; v < n_top; v ++) {
            fx[v] = modp_set(f[v], p);
            gx[v] = modp_set(g[v], p);
        }

        /*
         * Convert to NTT and compute our f and g.
         */
        modp_NTT2(fx, gm, logn_top, p, p0i);
        modp_NTT2(gx, gm, logn_top, p, p0i);
        for (e = logn_top; e > logn; e --) {
            modp_poly_rec_res(fx, e, p, p0i, R2);
            modp_poly_rec_res(gx, e, p, p0i, R2);
        }

        /*
         * From that point onward, we only need tables for
         * degree n, so we can save some space.
         */
        if (depth > 0) { /* always true */
            memmove(gm + n, igm, n * sizeof * igm);
            igm = gm + n;
            memmove(igm + n, fx, n * sizeof * ft);
            fx = igm + n;
            memmove(fx + n, gx, n * sizeof * gt);
            gx = fx + n;
        }

        /*
         * Get F' and G' modulo p and in NTT representation
         * (they have degree n/2). These values were computed
         * in a previous step, and stored in Ft and Gt.
         */
        Fp = gx + n;
        Gp = Fp + hn;
        for (v = 0, x = Ft + u, y = Gt + u;
                v < hn; v ++, x += llen, y += llen) {
            Fp[v] = *x;
            Gp[v] = *y;
        }
        modp_NTT2(Fp, gm, logn - 1, p, p0i);
        modp_NTT2(Gp, gm, logn - 1, p, p0i);

        /*
         * Compute our F and G modulo p.
         *
         * Equations are:
         *
         *   f'(x^2) = N(f)(x^2) = f * adj(f)
         *   g'(x^2) = N(g)(x^2) = g * adj(g)
         *
         *   f'*G' - g'*F' = q
         *
         *   F = F'(x^2) * adj(g)
         *   G = G'(x^2) * adj(f)
         *
         * The NTT representation of f is f(w) for all w which
         * are roots of phi. In the binary case, as well as in
         * the ternary case for all depth except the deepest,
         * these roots can be grouped in pairs (w,-w), and we
         * then have:
         *
         *   f(w) = adj(f)(-w)
         *   f(-w) = adj(f)(w)
         *
         * and w^2 is then a root for phi at the half-degree.
         *
         * At the deepest level in the ternary case, this still
         * holds, in the following sense: the roots of x^2-x+1
         * are (w,-w^2) (for w^3 = -1, and w != -1), and we
         * have:
         *
         *   f(w) = adj(f)(-w^2)
         *   f(-w^2) = adj(f)(w)
         *
         * In all case, we can thus compute F and G in NTT
         * representation by a few simple multiplications.
         * Moreover, the two roots for each pair are consecutive
         * in our bit-reversal encoding.
         */
        for (v = 0, x = Ft + u, y = Gt + u;
                v < hn; v ++, x += (llen << 1), y += (llen << 1)) {
            uint32_t ftA, ftB, gtA, gtB;
            uint32_t mFp, mGp;

            ftA = fx[(v << 1) + 0];
            ftB = fx[(v << 1) + 1];
            gtA = gx[(v << 1) + 0];
            gtB = gx[(v << 1) + 1];
            mFp = modp_montymul(Fp[v], R2, p, p0i);
            mGp = modp_montymul(Gp[v], R2, p, p0i);
            x[0] = modp_montymul(gtB, mFp, p, p0i);
            x[llen] = modp_montymul(gtA, mFp, p, p0i);
            y[0] = modp_montymul(ftB, mGp, p, p0i);
            y[llen] = modp_montymul(ftA, mGp, p, p0i);
        }
        modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
        modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);

        /*
         * Also save ft and gt (only up to size slen).
         */
        if (u < slen) {
            modp_iNTT2(fx, igm, logn, p, p0i);
            modp_iNTT2(gx, igm, logn, p, p0i);
            for (v = 0, x = ft + u, y = gt + u;
                    v < n; v ++, x += slen, y += slen) {
                *x = fx[v];
                *y = gx[v];
            }
        }
    }

    /*
     * Rebuild f, g, F and G with the CRT. Note that the elements of F
     * and G are consecutive, and thus can be rebuilt in a single
     * loop; similarly, the elements of f and g are consecutive.
     */
    zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1);
    zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1);

    /*
     * Here starts the Babai reduction, specialized for depth = 1.
     *
     * Candidates F and G (from Ft and Gt), and base f and g (ft and gt),
     * are converted to floating point. There is no scaling, and a
     * single pass is sufficient.
     */

    /*
     * Convert F and G into floating point (rt1 and rt2).
     */
    rt1 = align_fpr(tmp, gt + slen * n);
    rt2 = rt1 + n;
    poly_big_to_fp(rt1, Ft, llen, llen, logn);
    poly_big_to_fp(rt2, Gt, llen, llen, logn);

    /*
     * Integer representation of F and G is no longer needed, we
     * can remove it.
     */
    memmove(tmp, ft, 2 * slen * n * sizeof * ft);
    ft = tmp;
    gt = ft + slen * n;
    rt3 = align_fpr(tmp, gt + slen * n);
    memmove(rt3, rt1, 2 * n * sizeof * rt1);
    rt1 = rt3;
    rt2 = rt1 + n;
    rt3 = rt2 + n;
    rt4 = rt3 + n;

    /*
     * Convert f and g into floating point (rt3 and rt4).
     */
    poly_big_to_fp(rt3, ft, slen, slen, logn);
    poly_big_to_fp(rt4, gt, slen, slen, logn);

    /*
     * Remove unneeded ft and gt.
     */
    memmove(tmp, rt1, 4 * n * sizeof * rt1);
    rt1 = (fpr *)tmp;
    rt2 = rt1 + n;
    rt3 = rt2 + n;
    rt4 = rt3 + n;
    rt5 = rt4 + n;
    rt6 = rt5 + n;

    /*
     * We now have:
     *   rt1 = F
     *   rt2 = G
     *   rt3 = f
     *   rt4 = g
     * in that order in RAM. We convert all of them to FFT.
     */
    PQCLEAN_FALCON512_AARCH64_FFT(rt1, logn);
    PQCLEAN_FALCON512_AARCH64_FFT(rt2, logn);
    PQCLEAN_FALCON512_AARCH64_FFT(rt3, logn);
    PQCLEAN_FALCON512_AARCH64_FFT(rt4, logn);

    /*
     * Compute:
     *   rt5 = F*adj(f) + G*adj(g)
     *   rt6 = 1 / (f*adj(f) + g*adj(g))
     * (Note that rt6 is half-length.)
     */
    PQCLEAN_FALCON512_AARCH64_poly_add_muladj_fft(rt5, rt1, rt2, rt3, rt4, logn);
    PQCLEAN_FALCON512_AARCH64_poly_invnorm2_fft(rt6, rt3, rt4, logn);

    /*
     * Compute:
     *   rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g))
     */
    PQCLEAN_FALCON512_AARCH64_poly_mul_autoadj_fft(rt5, rt5, rt6, logn);

    /*
     * Compute k as the rounded version of rt5. Check that none of
     * the values is larger than 2^63-1 (in absolute value)
     * because that would make the fpr_rint() do something undefined;
     * note that any out-of-bounds value here implies a failure and
     * (f,g) will be discarded, so we can make a simple test.
     */
    PQCLEAN_FALCON512_AARCH64_iFFT(rt5, logn);
    for (u = 0; u < n; u ++) {
        fpr z;

        z = rt5[u];
        if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) {
            return 0;
        }
        rt5[u] = fpr_of(fpr_rint(z));
    }
    PQCLEAN_FALCON512_AARCH64_FFT(rt5, logn);

    /*
     * Subtract k*f from F, and k*g from G.
     */
    PQCLEAN_FALCON512_AARCH64_poly_mul_fft(rt3, rt3, rt5, logn);
    PQCLEAN_FALCON512_AARCH64_poly_sub(rt1, rt1, rt3, logn);
    PQCLEAN_FALCON512_AARCH64_iFFT(rt1, logn);

    PQCLEAN_FALCON512_AARCH64_poly_mul_fft(rt4, rt4, rt5, logn);
    PQCLEAN_FALCON512_AARCH64_poly_sub(rt2, rt2, rt4, logn);
    PQCLEAN_FALCON512_AARCH64_iFFT(rt2, logn);

    /*
     * Convert back F and G to integers, and return.
     */
    Ft = tmp;
    Gt = Ft + n;
    rt3 = align_fpr(tmp, Gt + n);
    memmove(rt3, rt1, 2 * n * sizeof * rt1);
    rt1 = rt3;
    rt2 = rt1 + n;
    for (u = 0; u < n; u ++) {
        Ft[u] = (uint32_t)fpr_rint(rt1[u]);
        Gt[u] = (uint32_t)fpr_rint(rt2[u]);
    }

    return 1;
}

/*
 * Solving the NTRU equation, top level. Upon entry, the F and G
 * from the previous level should be in the tmp[] array.
 *
 * Returned value: 1 on success, 0 on error.
 */
static int
solve_NTRU_binary_depth0(unsigned logn,
                         const int8_t *f, const int8_t *g, uint32_t *tmp) {
    size_t n, hn, u;
    uint32_t p, p0i, R2;
    uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5;
    uint32_t *gm, *igm, *ft, *gt;
    fpr *rt2, *rt3;

    n = (size_t)1 << logn;
    hn = n >> 1;

    /*
     * Equations are:
     *
     *   f' = f0^2 - X^2*f1^2
     *   g' = g0^2 - X^2*g1^2
     *   F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
     *   F = F'*(g0 - X*g1)
     *   G = G'*(f0 - X*f1)
     *
     * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
     * degree N/2 (their odd-indexed coefficients are all zero).
     *
     * Everything should fit in 31-bit integers, hence we can just use
     * the first small prime p = 2147473409.
     */
    p = PRIMES[0].p;
    p0i = modp_ninv31(p);
    R2 = modp_R2(p, p0i);

    Fp = tmp;
    Gp = Fp + hn;
    ft = Gp + hn;
    gt = ft + n;
    gm = gt + n;
    igm = gm + n;

    modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);

    /*
     * Convert F' anf G' in NTT representation.
     */
    for (u = 0; u < hn; u ++) {
        Fp[u] = modp_set(zint_one_to_plain(Fp + u), p);
        Gp[u] = modp_set(zint_one_to_plain(Gp + u), p);
    }
    modp_NTT2(Fp, gm, logn - 1, p, p0i);
    modp_NTT2(Gp, gm, logn - 1, p, p0i);

    /*
     * Load f and g and convert them to NTT representation.
     */
    for (u = 0; u < n; u ++) {
        ft[u] = modp_set(f[u], p);
        gt[u] = modp_set(g[u], p);
    }
    modp_NTT2(ft, gm, logn, p, p0i);
    modp_NTT2(gt, gm, logn, p, p0i);

    /*
     * Build the unreduced F,G in ft and gt.
     */
    for (u = 0; u < n; u += 2) {
        uint32_t ftA, ftB, gtA, gtB;
        uint32_t mFp, mGp;

        ftA = ft[u + 0];
        ftB = ft[u + 1];
        gtA = gt[u + 0];
        gtB = gt[u + 1];
        mFp = modp_montymul(Fp[u >> 1], R2, p, p0i);
        mGp = modp_montymul(Gp[u >> 1], R2, p, p0i);
        ft[u + 0] = modp_montymul(gtB, mFp, p, p0i);
        ft[u + 1] = modp_montymul(gtA, mFp, p, p0i);
        gt[u + 0] = modp_montymul(ftB, mGp, p, p0i);
        gt[u + 1] = modp_montymul(ftA, mGp, p, p0i);
    }
    modp_iNTT2(ft, igm, logn, p, p0i);
    modp_iNTT2(gt, igm, logn, p, p0i);

    Gp = Fp + n;
    t1 = Gp + n;
    memmove(Fp, ft, 2 * n * sizeof * ft);

    /*
     * We now need to apply the Babai reduction. At that point,
     * we have F and G in two n-word arrays.
     *
     * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g)
     * modulo p, using the NTT. We still move memory around in
     * order to save RAM.
     */
    t2 = t1 + n;
    t3 = t2 + n;
    t4 = t3 + n;
    t5 = t4 + n;

    /*
     * Compute the NTT tables in t1 and t2. We do not keep t2
     * (we'll recompute it later on).
     */
    modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i);

    /*
     * Convert F and G to NTT.
     */
    modp_NTT2(Fp, t1, logn, p, p0i);
    modp_NTT2(Gp, t1, logn, p, p0i);

    /*
     * Load f and adj(f) in t4 and t5, and convert them to NTT
     * representation.
     */
    t4[0] = t5[0] = modp_set(f[0], p);
    for (u = 1; u < n; u ++) {
        t4[u] = modp_set(f[u], p);
        t5[n - u] = modp_set(-f[u], p);
    }
    modp_NTT2(t4, t1, logn, p, p0i);
    modp_NTT2(t5, t1, logn, p, p0i);

    /*
     * Compute F*adj(f) in t2, and f*adj(f) in t3.
     */
    for (u = 0; u < n; u ++) {
        uint32_t w;

        w = modp_montymul(t5[u], R2, p, p0i);
        t2[u] = modp_montymul(w, Fp[u], p, p0i);
        t3[u] = modp_montymul(w, t4[u], p, p0i);
    }

    /*
     * Load g and adj(g) in t4 and t5, and convert them to NTT
     * representation.
     */
    t4[0] = t5[0] = modp_set(g[0], p);
    for (u = 1; u < n; u ++) {
        t4[u] = modp_set(g[u], p);
        t5[n - u] = modp_set(-g[u], p);
    }
    modp_NTT2(t4, t1, logn, p, p0i);
    modp_NTT2(t5, t1, logn, p, p0i);

    /*
     * Add G*adj(g) to t2, and g*adj(g) to t3.
     */
    for (u = 0; u < n; u ++) {
        uint32_t w;

        w = modp_montymul(t5[u], R2, p, p0i);
        t2[u] = modp_add(t2[u],
                         modp_montymul(w, Gp[u], p, p0i), p);
        t3[u] = modp_add(t3[u],
                         modp_montymul(w, t4[u], p, p0i), p);
    }

    /*
     * Convert back t2 and t3 to normal representation (normalized
     * around 0), and then
     * move them to t1 and t2. We first need to recompute the
     * inverse table for NTT.
     */
    modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i);
    modp_iNTT2(t2, t4, logn, p, p0i);
    modp_iNTT2(t3, t4, logn, p, p0i);
    for (u = 0; u < n; u ++) {
        t1[u] = (uint32_t)modp_norm(t2[u], p);
        t2[u] = (uint32_t)modp_norm(t3[u], p);
    }

    /*
     * At that point, array contents are:
     *
     *   F (NTT representation) (Fp)
     *   G (NTT representation) (Gp)
     *   F*adj(f)+G*adj(g) (t1)
     *   f*adj(f)+g*adj(g) (t2)
     *
     * We want to divide t1 by t2. The result is not integral; it
     * must be rounded. We thus need to use the FFT.
     */

    /*
     * Get f*adj(f)+g*adj(g) in FFT representation. Since this
     * polynomial is auto-adjoint, all its coordinates in FFT
     * representation are actually real, so we can truncate off
     * the imaginary parts.
     */
    rt3 = align_fpr(tmp, t3);
    for (u = 0; u < n; u ++) {
        rt3[u] = fpr_of(((int32_t *)t2)[u]);
    }
    PQCLEAN_FALCON512_AARCH64_FFT(rt3, logn);
    rt2 = align_fpr(tmp, t2);
    memmove(rt2, rt3, hn * sizeof * rt3);

    /*
     * Convert F*adj(f)+G*adj(g) in FFT representation.
     */
    rt3 = rt2 + hn;
    for (u = 0; u < n; u ++) {
        rt3[u] = fpr_of(((int32_t *)t1)[u]);
    }
    PQCLEAN_FALCON512_AARCH64_FFT(rt3, logn);

    /*
     * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get
     * its rounded normal representation in t1.
     */
    PQCLEAN_FALCON512_AARCH64_poly_div_autoadj_fft(rt3, rt3, rt2, logn);
    PQCLEAN_FALCON512_AARCH64_iFFT(rt3, logn);
    for (u = 0; u < n; u ++) {
        t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p);
    }

    /*
     * RAM contents are now:
     *
     *   F (NTT representation) (Fp)
     *   G (NTT representation) (Gp)
     *   k (t1)
     *
     * We want to compute F-k*f, and G-k*g.
     */
    t2 = t1 + n;
    t3 = t2 + n;
    t4 = t3 + n;
    t5 = t4 + n;
    modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i);
    for (u = 0; u < n; u ++) {
        t4[u] = modp_set(f[u], p);
        t5[u] = modp_set(g[u], p);
    }
    modp_NTT2(t1, t2, logn, p, p0i);
    modp_NTT2(t4, t2, logn, p, p0i);
    modp_NTT2(t5, t2, logn, p, p0i);
    for (u = 0; u < n; u ++) {
        uint32_t kw;

        kw = modp_montymul(t1[u], R2, p, p0i);
        Fp[u] = modp_sub(Fp[u],
                         modp_montymul(kw, t4[u], p, p0i), p);
        Gp[u] = modp_sub(Gp[u],
                         modp_montymul(kw, t5[u], p, p0i), p);
    }
    modp_iNTT2(Fp, t3, logn, p, p0i);
    modp_iNTT2(Gp, t3, logn, p, p0i);
    for (u = 0; u < n; u ++) {
        Fp[u] = (uint32_t)modp_norm(Fp[u], p);
        Gp[u] = (uint32_t)modp_norm(Gp[u], p);
    }

    return 1;
}

/*
 * Solve the NTRU equation. Returned value is 1 on success, 0 on error.
 * G can be NULL, in which case that value is computed but not returned.
 * If any of the coefficients of F and G exceeds lim (in absolute value),
 * then 0 is returned.
 */
static int
solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
           const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) {
    size_t n, u;
    uint32_t *ft, *gt, *Ft, *Gt, *gm;
    uint32_t p, p0i, r;
    const small_prime *primes;

    n = MKN(logn);

    if (!solve_NTRU_deepest(logn, f, g, tmp)) {
        return 0;
    }

    /*
     * For logn <= 2, we need to use solve_NTRU_intermediate()
     * directly, because coefficients are a bit too large and
     * do not fit the hypotheses in solve_NTRU_binary_depth0().
     */
    if (logn <= 2) {
        unsigned depth;

        depth = logn;
        while (depth -- > 0) {
            if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
                return 0;
            }
        }
    } else {
        unsigned depth;

        depth = logn;
        while (depth -- > 2) {
            if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
                return 0;
            }
        }
        if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) {
            return 0;
        }
        if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) {
            return 0;
        }
    }

    /*
     * If no buffer has been provided for G, use a temporary one.
     */
    if (G == NULL) {
        G = (int8_t *)(tmp + 2 * n);
    }

    /*
     * Final F and G are in fk->tmp, one word per coefficient
     * (signed value over 31 bits).
     */
    if (!poly_big_to_small(F, tmp, lim, logn)
            || !poly_big_to_small(G, tmp + n, lim, logn)) {
        return 0;
    }

    /*
     * Verify that the NTRU equation is fulfilled. Since all elements
     * have short lengths, verifying modulo a small prime p works, and
     * allows using the NTT.
     *
     * We put Gt[] first in tmp[], and process it first, so that it does
     * not overlap with G[] in case we allocated it ourselves.
     */
    Gt = tmp;
    ft = Gt + n;
    gt = ft + n;
    Ft = gt + n;
    gm = Ft + n;

    primes = PRIMES;
    p = primes[0].p;
    p0i = modp_ninv31(p);
    modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i);
    for (u = 0; u < n; u ++) {
        Gt[u] = modp_set(G[u], p);
    }
    for (u = 0; u < n; u ++) {
        ft[u] = modp_set(f[u], p);
        gt[u] = modp_set(g[u], p);
        Ft[u] = modp_set(F[u], p);
    }
    modp_NTT2(ft, gm, logn, p, p0i);
    modp_NTT2(gt, gm, logn, p, p0i);
    modp_NTT2(Ft, gm, logn, p, p0i);
    modp_NTT2(Gt, gm, logn, p, p0i);
    r = modp_montymul(12289, 1, p, p0i);
    for (u = 0; u < n; u ++) {
        uint32_t z;

        z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i),
                     modp_montymul(gt[u], Ft[u], p, p0i), p);
        if (z != r) {
            return 0;
        }
    }

    return 1;
}

/*
 * Generate a random polynomial with a Gaussian distribution. This function
 * also makes sure that the resultant of the polynomial with phi is odd.
 */
static void
poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
    size_t n, u;
    unsigned mod2;

    n = MKN(logn);
    mod2 = 0;
    for (u = 0; u < n; u ++) {
        int s;

restart:
        s = mkgauss(rng, logn);

        /*
         * We need the coefficient to fit within -127..+127;
         * realistically, this is always the case except for
         * the very low degrees (N = 2 or 4), for which there
         * is no real security anyway.
         */
        if (s < -127 || s > 127) {
            goto restart;
        }

        /*
         * We need the sum of all coefficients to be 1; otherwise,
         * the resultant of the polynomial with X^N+1 will be even,
         * and the binary GCD will fail.
         */
        if (u == n - 1) {
            if ((mod2 ^ (unsigned)(s & 1)) == 0) {
                goto restart;
            }
        } else {
            mod2 ^= (unsigned)(s & 1);
        }
        f[u] = (int8_t)s;
    }
}

/* see falcon.h */
void
PQCLEAN_FALCON512_AARCH64_keygen(inner_shake256_context *rng,
                                 int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
                                 unsigned logn, uint8_t *tmp) {
    /*
     * Algorithm is the following:
     *
     *  - Generate f and g with the Gaussian distribution.
     *
     *  - If either Res(f,phi) or Res(g,phi) is even, try again.
     *
     *  - If ||(f,g)|| is too large, try again.
     *
     *  - If ||B~_{f,g}|| is too large, try again.
     *
     *  - If f is not invertible mod phi mod q, try again.
     *
     *  - Compute h = g/f mod phi mod q.
     *
     *  - Solve the NTRU equation fG - gF = q; if the solving fails,
     *    try again. Usual failure condition is when Res(f,phi)
     *    and Res(g,phi) are not prime to each other.
     */
    size_t n, u;
    int16_t *h2, *tmp2;
    RNG_CONTEXT *rc;

    n = MKN(logn);
    rc = rng;

    /*
     * We need to generate f and g randomly, until we find values
     * such that the norm of (g,-f), and of the orthogonalized
     * vector, are satisfying. The orthogonalized vector is:
     *   (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g)))
     * (it is actually the (N+1)-th row of the Gram-Schmidt basis).
     *
     * In the binary case, coefficients of f and g are generated
     * independently of each other, with a discrete Gaussian
     * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then,
     * the two vectors have expected norm 1.17*sqrt(q), which is
     * also our acceptance bound: we require both vectors to be no
     * larger than that (this will be satisfied about 1/4th of the
     * time, thus we expect sampling new (f,g) about 4 times for that
     * step).
     *
     * We require that Res(f,phi) and Res(g,phi) are both odd (the
     * NTRU equation solver requires it).
     */
    for (;;) {
        fpr *rt1, *rt2, *rt3;
        fpr bnorm;
        uint32_t normf, normg, norm;
        int lim;

        /*
         * The poly_small_mkgauss() function makes sure
         * that the sum of coefficients is 1 modulo 2
         * (i.e. the resultant of the polynomial with phi
         * will be odd).
         */
        poly_small_mkgauss(rc, f, logn);
        poly_small_mkgauss(rc, g, logn);

        /*
         * Verify that all coefficients are within the bounds
         * defined in max_fg_bits. This is the case with
         * overwhelming probability; this guarantees that the
         * key will be encodable with FALCON_COMP_TRIM.
         */
        lim = 1 << (PQCLEAN_FALCON512_AARCH64_max_fg_bits[logn] - 1);
        for (u = 0; u < n; u ++) {
            /*
             * We can use non-CT tests since on any failure
             * we will discard f and g.
             */
            if (f[u] >= lim || f[u] <= -lim
                    || g[u] >= lim || g[u] <= -lim) {
                lim = -1;
                break;
            }
        }
        if (lim < 0) {
            continue;
        }

        /*
         * Bound is 1.17*sqrt(q). We compute the squared
         * norms. With q = 12289, the squared bound is:
         *   (1.17^2)* 12289 = 16822.4121
         * Since f and g are integral, the squared norm
         * of (g,-f) is an integer.
         */
        normf = poly_small_sqnorm(f, logn);
        normg = poly_small_sqnorm(g, logn);
        norm = (normf + normg) | -((normf | normg) >> 31);
        if (norm >= 16823) {
            continue;
        }

        /*
         * We compute the orthogonalized vector norm.
         */
        rt1 = (fpr *)tmp;
        rt2 = rt1 + n;
        rt3 = rt2 + n;

        poly_small_to_fp(rt1, f, logn);
        PQCLEAN_FALCON512_AARCH64_FFT(rt1, logn);
        PQCLEAN_FALCON512_AARCH64_poly_adj_fft(rt1, rt1, logn);

        poly_small_to_fp(rt2, g, logn);
        PQCLEAN_FALCON512_AARCH64_FFT(rt2, logn);
        PQCLEAN_FALCON512_AARCH64_poly_adj_fft(rt2, rt2, logn);

        PQCLEAN_FALCON512_AARCH64_poly_invnorm2_fft(rt3, rt1, rt2, logn);

        PQCLEAN_FALCON512_AARCH64_poly_mulconst(rt1, rt1, fpr_q, logn);
        PQCLEAN_FALCON512_AARCH64_poly_mul_autoadj_fft(rt1, rt1, rt3, logn);
        PQCLEAN_FALCON512_AARCH64_iFFT(rt1, logn);

        PQCLEAN_FALCON512_AARCH64_poly_mulconst(rt2, rt2, fpr_q, logn);
        PQCLEAN_FALCON512_AARCH64_poly_mul_autoadj_fft(rt2, rt2, rt3, logn);
        PQCLEAN_FALCON512_AARCH64_iFFT(rt2, logn);

        bnorm = PQCLEAN_FALCON512_AARCH64_compute_bnorm(rt1, rt2);

        if (!fpr_lt(bnorm, fpr_bnorm_max)) {
            continue;
        }

        /*
         * Compute public key h = g/f mod X^N+1 mod q. If this
         * fails, we must restart.
         */
        if (h == NULL) {
            h2 = (int16_t *)tmp;
            tmp2 = h2 + n;
        } else {
            h2 = (int16_t *)h;
            tmp2 = (int16_t *)tmp;
        }

        if (!PQCLEAN_FALCON512_AARCH64_compute_public(h2, f, g, tmp2)) {
            continue;
        }

        /*
         * Solve the NTRU equation to get F and G.
         */
        lim = (1 << (PQCLEAN_FALCON512_AARCH64_max_FG_bits[logn] - 1)) - 1;
        if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) {
            continue;
        }

        /*
         * Key pair is generated.
         */
        break;
    }
}
